//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
9 #include "llvm/CodeGen/GlobalISel/Combiner.h"
10 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
11 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
13 #include "llvm/CodeGen/GlobalISel/Utils.h"
14 #include "llvm/CodeGen/MachineDominators.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineInstr.h"
17 #include "llvm/CodeGen/MachineRegisterInfo.h"
18 #include "llvm/CodeGen/TargetInstrInfo.h"
19 #include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
21 #include "llvm/Target/TargetMachine.h"
22 
23 #define DEBUG_TYPE "gi-combiner"
24 
25 using namespace llvm;
26 
27 // Option to allow testing of the combiner while no targets know about indexed
28 // addressing.
29 static cl::opt<bool>
30     ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
31                        cl::desc("Force all indexed operations to be "
32                                 "legal for the GlobalISel combiner"));
33 
34 
35 CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
36                                MachineIRBuilder &B, GISelKnownBits *KB,
37                                MachineDominatorTree *MDT)
38     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
39       KB(KB), MDT(MDT) {
40   (void)this->KB;
41 }
42 
43 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
44                                     Register ToReg) const {
45   Observer.changingAllUsesOfReg(MRI, FromReg);
46 
47   if (MRI.constrainRegAttrs(ToReg, FromReg))
48     MRI.replaceRegWith(FromReg, ToReg);
49   else
50     Builder.buildCopy(ToReg, FromReg);
51 
52   Observer.finishedChangingAllUsesOfReg();
53 }
54 
55 void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
56                                       MachineOperand &FromRegOp,
57                                       Register ToReg) const {
58   assert(FromRegOp.getParent() && "Expected an operand in an MI");
59   Observer.changingInstr(*FromRegOp.getParent());
60 
61   FromRegOp.setReg(ToReg);
62 
63   Observer.changedInstr(*FromRegOp.getParent());
64 }
65 
66 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
67   if (matchCombineCopy(MI)) {
68     applyCombineCopy(MI);
69     return true;
70   }
71   return false;
72 }
73 bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
74   if (MI.getOpcode() != TargetOpcode::COPY)
75     return false;
76   Register DstReg = MI.getOperand(0).getReg();
77   Register SrcReg = MI.getOperand(1).getReg();
78 
  // Give up if either DstReg or SrcReg is a physical register.
80   if (Register::isPhysicalRegister(DstReg) ||
81       Register::isPhysicalRegister(SrcReg))
82     return false;
83 
  // Get the types and check that they are compatible.
85   LLT DstTy = MRI.getType(DstReg);
86   LLT SrcTy = MRI.getType(SrcReg);
87   // Give up if one has a valid LLT, but the other doesn't.
88   if (DstTy.isValid() != SrcTy.isValid())
89     return false;
90   // Give up if the types don't match.
91   if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
92     return false;
93 
94   // Get the register banks and classes.
95   const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
96   const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
97   const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
98   const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
99 
100   // Replace if the register constraints match.
101   if ((SrcRC == DstRC) && (SrcBank == DstBank))
102     return true;
103   // Replace if DstReg has no constraints.
104   if (!DstBank && !DstRC)
105     return true;
106 
107   return false;
108 }
109 void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
110   Register DstReg = MI.getOperand(0).getReg();
111   Register SrcReg = MI.getOperand(1).getReg();
112   MI.eraseFromParent();
113   replaceRegWith(MRI, DstReg, SrcReg);
114 }
115 
116 bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
117   bool IsUndef = false;
118   SmallVector<Register, 4> Ops;
119   if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
120     applyCombineConcatVectors(MI, IsUndef, Ops);
121     return true;
122   }
123   return false;
124 }
125 
126 bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
127                                                SmallVectorImpl<Register> &Ops) {
128   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
129          "Invalid instruction");
130   IsUndef = true;
131   MachineInstr *Undef = nullptr;
132 
133   // Walk over all the operands of concat vectors and check if they are
134   // build_vector themselves or undef.
135   // Then collect their operands in Ops.
136   for (const MachineOperand &MO : MI.uses()) {
137     Register Reg = MO.getReg();
138     MachineInstr *Def = MRI.getVRegDef(Reg);
139     assert(Def && "Operand not defined");
140     switch (Def->getOpcode()) {
141     case TargetOpcode::G_BUILD_VECTOR:
142       IsUndef = false;
143       // Remember the operands of the build_vector to fold
144       // them into the yet-to-build flattened concat vectors.
145       for (const MachineOperand &BuildVecMO : Def->uses())
146         Ops.push_back(BuildVecMO.getReg());
147       break;
148     case TargetOpcode::G_IMPLICIT_DEF: {
149       LLT OpType = MRI.getType(Reg);
150       // Keep one undef value for all the undef operands.
151       if (!Undef) {
152         Builder.setInsertPt(*MI.getParent(), MI);
153         Undef = Builder.buildUndef(OpType.getScalarType());
154       }
155       assert(MRI.getType(Undef->getOperand(0).getReg()) ==
156                  OpType.getScalarType() &&
157              "All undefs should have the same type");
158       // Break the undef vector in as many scalar elements as needed
159       // for the flattening.
160       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
161            EltIdx != EltEnd; ++EltIdx)
162         Ops.push_back(Undef->getOperand(0).getReg());
163       break;
164     }
165     default:
166       return false;
167     }
168   }
169   return true;
170 }
171 void CombinerHelper::applyCombineConcatVectors(
172     MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
  // We determined that the concat_vectors can be flattened.
174   // Generate the flattened build_vector.
175   Register DstReg = MI.getOperand(0).getReg();
176   Builder.setInsertPt(*MI.getParent(), MI);
177   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
178 
  // Note: IsUndef is sort of redundant. We could have determined it by
  // checking that all Ops are undef.  Alternatively, we could have generated
  // a build_vector of undefs and relied on another combine to clean that up.
  // For now, given we already gather this information in
  // tryCombineConcatVectors, just save compile time and issue the right
  // thing.
185   if (IsUndef)
186     Builder.buildUndef(NewDstReg);
187   else
188     Builder.buildBuildVector(NewDstReg, Ops);
189   MI.eraseFromParent();
190   replaceRegWith(MRI, DstReg, NewDstReg);
191 }
192 
193 bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
194   SmallVector<Register, 4> Ops;
195   if (matchCombineShuffleVector(MI, Ops)) {
196     applyCombineShuffleVector(MI, Ops);
197     return true;
198   }
199   return false;
200 }
201 
202 bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
203                                                SmallVectorImpl<Register> &Ops) {
204   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
205          "Invalid instruction kind");
206   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
207   Register Src1 = MI.getOperand(1).getReg();
208   LLT SrcType = MRI.getType(Src1);
  // As bizarre as it may look, shuffle vector can actually produce a
  // scalar! This is because at the IR level a <1 x ty> shuffle
  // vector is perfectly valid.
212   unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
213   unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
214 
  // If the resulting vector is smaller than the size of the source
  // vectors being concatenated, we won't be able to replace the
  // shuffle vector with a concat_vectors.
  //
  // Note: We may still be able to produce a concat_vectors fed by
  //       extract_vector_elt and so on. It is less clear that would
  //       be better though, so don't bother for now.
  //
  // If the destination is a scalar, the size of the sources doesn't
  // matter; we will lower the shuffle to a plain copy. This only
  // works if the source and destination have the same size, but
  // that's covered by the next condition.
  //
  // TODO: If the sizes of the source and destination don't match,
  //       we could still emit an extract_vector_elt in that case.
230   if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
231     return false;
232 
233   // Check that the shuffle mask can be broken evenly between the
234   // different sources.
235   if (DstNumElts % SrcNumElts != 0)
236     return false;
237 
238   // Mask length is a multiple of the source vector length.
239   // Check if the shuffle is some kind of concatenation of the input
240   // vectors.
241   unsigned NumConcat = DstNumElts / SrcNumElts;
242   SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
243   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
244   for (unsigned i = 0; i != DstNumElts; ++i) {
245     int Idx = Mask[i];
246     // Undef value.
247     if (Idx < 0)
248       continue;
249     // Ensure the indices in each SrcType sized piece are sequential and that
250     // the same source is used for the whole piece.
251     if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
252         (ConcatSrcs[i / SrcNumElts] >= 0 &&
253          ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
254       return false;
255     // Remember which source this index came from.
256     ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
257   }
258 
259   // The shuffle is concatenating multiple vectors together.
260   // Collect the different operands for that.
261   Register UndefReg;
262   Register Src2 = MI.getOperand(2).getReg();
263   for (auto Src : ConcatSrcs) {
264     if (Src < 0) {
265       if (!UndefReg) {
266         Builder.setInsertPt(*MI.getParent(), MI);
267         UndefReg = Builder.buildUndef(SrcType).getReg(0);
268       }
269       Ops.push_back(UndefReg);
270     } else if (Src == 0)
271       Ops.push_back(Src1);
272     else
273       Ops.push_back(Src2);
274   }
275   return true;
276 }
277 
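// Illustrative sketch of the rewrite performed by the apply function below
// (register names and types are made up):
//   %d:_(<4 x s32>) = G_SHUFFLE_VECTOR %s1(<2 x s32>), %s2(<2 x s32>),
//                                      shufflemask(0, 1, 2, 3)
// is treated as a concatenation of the sources and becomes
//   %d:_(<4 x s32>) = G_CONCAT_VECTORS %s1(<2 x s32>), %s2(<2 x s32>)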
278 void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
279                                                const ArrayRef<Register> Ops) {
280   Register DstReg = MI.getOperand(0).getReg();
281   Builder.setInsertPt(*MI.getParent(), MI);
282   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
283 
284   if (Ops.size() == 1)
285     Builder.buildCopy(NewDstReg, Ops[0]);
286   else
287     Builder.buildMerge(NewDstReg, Ops);
288 
289   MI.eraseFromParent();
290   replaceRegWith(MRI, DstReg, NewDstReg);
291 }
292 
293 namespace {
294 
/// Select a preference between two uses. CurrentUse is the current preference
/// while the *ForCandidate arguments describe the candidate under
/// consideration.
297 PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
298                                   const LLT &TyForCandidate,
299                                   unsigned OpcodeForCandidate,
300                                   MachineInstr *MIForCandidate) {
301   if (!CurrentUse.Ty.isValid()) {
302     if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
303         CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
304       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
305     return CurrentUse;
306   }
307 
308   // We permit the extend to hoist through basic blocks but this is only
309   // sensible if the target has extending loads. If you end up lowering back
310   // into a load and extend during the legalizer then the end result is
311   // hoisting the extend up to the load.
312 
313   // Prefer defined extensions to undefined extensions as these are more
314   // likely to reduce the number of instructions.
315   if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
316       CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
317     return CurrentUse;
318   else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
319            OpcodeForCandidate != TargetOpcode::G_ANYEXT)
320     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
321 
  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so there is more to gain by folding them into the load.
324   if (CurrentUse.Ty == TyForCandidate) {
325     if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
326         OpcodeForCandidate == TargetOpcode::G_ZEXT)
327       return CurrentUse;
328     else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
329              OpcodeForCandidate == TargetOpcode::G_SEXT)
330       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
331   }
332 
333   // This is potentially target specific. We've chosen the largest type
334   // because G_TRUNC is usually free. One potential catch with this is that
335   // some targets have a reduced number of larger registers than smaller
336   // registers and this choice potentially increases the live-range for the
337   // larger value.
338   if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
339     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
340   }
341   return CurrentUse;
342 }
343 
344 /// Find a suitable place to insert some instructions and insert them. This
345 /// function accounts for special cases like inserting before a PHI node.
346 /// The current strategy for inserting before PHI's is to duplicate the
347 /// instructions for each predecessor. However, while that's ok for G_TRUNC
348 /// on most targets since it generally requires no code, other targets/cases may
349 /// want to try harder to find a dominating block.
350 static void InsertInsnsWithoutSideEffectsBeforeUse(
351     MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
352     std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
353                        MachineOperand &UseMO)>
354         Inserter) {
355   MachineInstr &UseMI = *UseMO.getParent();
356 
357   MachineBasicBlock *InsertBB = UseMI.getParent();
358 
359   // If the use is a PHI then we want the predecessor block instead.
360   if (UseMI.isPHI()) {
361     MachineOperand *PredBB = std::next(&UseMO);
362     InsertBB = PredBB->getMBB();
363   }
364 
365   // If the block is the same block as the def then we want to insert just after
366   // the def instead of at the start of the block.
367   if (InsertBB == DefMI.getParent()) {
368     MachineBasicBlock::iterator InsertPt = &DefMI;
369     Inserter(InsertBB, std::next(InsertPt), UseMO);
370     return;
371   }
372 
373   // Otherwise we want the start of the BB
374   Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
375 }
376 } // end anonymous namespace
377 
378 bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
379   PreferredTuple Preferred;
380   if (matchCombineExtendingLoads(MI, Preferred)) {
381     applyCombineExtendingLoads(MI, Preferred);
382     return true;
383   }
384   return false;
385 }
386 
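// Sketch of the overall extending-loads combine (illustrative registers):
//   %v:_(s8) = G_LOAD %ptr
//   %e:_(s32) = G_SEXT %v(s8)
// becomes
//   %e:_(s32) = G_SEXTLOAD %ptr
// with any remaining non-extend uses of %v rewritten through a G_TRUNC of %e.
// The detailed per-use cases are documented in applyCombineExtendingLoads.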
387 bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
388                                                 PreferredTuple &Preferred) {
389   // We match the loads and follow the uses to the extend instead of matching
390   // the extends and following the def to the load. This is because the load
391   // must remain in the same position for correctness (unless we also add code
392   // to find a safe place to sink it) whereas the extend is freely movable.
393   // It also prevents us from duplicating the load for the volatile case or just
394   // for performance.
395 
396   if (MI.getOpcode() != TargetOpcode::G_LOAD &&
397       MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
398       MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
399     return false;
400 
401   auto &LoadValue = MI.getOperand(0);
402   assert(LoadValue.isReg() && "Result wasn't a register?");
403 
404   LLT LoadValueTy = MRI.getType(LoadValue.getReg());
405   if (!LoadValueTy.isScalar())
406     return false;
407 
  // Most architectures are going to legalize loads narrower than s8 into at
  // least a 1-byte load, and the MMOs can only describe memory accesses in
  // multiples of bytes.
410   // If we try to perform extload combining on those, we can end up with
411   // %a(s8) = extload %ptr (load 1 byte from %ptr)
412   // ... which is an illegal extload instruction.
413   if (LoadValueTy.getSizeInBits() < 8)
414     return false;
415 
416   // For non power-of-2 types, they will very likely be legalized into multiple
417   // loads. Don't bother trying to match them into extending loads.
418   if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
419     return false;
420 
421   // Find the preferred type aside from the any-extends (unless it's the only
  // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
424   // relative type sizes. At the same time, pick an extend to use based on the
425   // extend involved in the chosen type.
426   unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
427                                  ? TargetOpcode::G_ANYEXT
428                                  : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
429                                        ? TargetOpcode::G_SEXT
430                                        : TargetOpcode::G_ZEXT;
431   Preferred = {LLT(), PreferredOpcode, nullptr};
432   for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
433     if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
434         UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
435         UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
436       Preferred = ChoosePreferredUse(Preferred,
437                                      MRI.getType(UseMI.getOperand(0).getReg()),
438                                      UseMI.getOpcode(), &UseMI);
439     }
440   }
441 
442   // There were no extends
443   if (!Preferred.MI)
444     return false;
  // It should be impossible to choose an extend without selecting a different
446   // type since by definition the result of an extend is larger.
447   assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
448 
449   LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
450   return true;
451 }
452 
453 void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
454                                                 PreferredTuple &Preferred) {
455   // Rewrite the load to the chosen extending load.
456   Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
457 
458   // Inserter to insert a truncate back to the original type at a given point
459   // with some basic CSE to limit truncate duplication to one per BB.
460   DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
461   auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
462                            MachineBasicBlock::iterator InsertBefore,
463                            MachineOperand &UseMO) {
464     MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
465     if (PreviouslyEmitted) {
466       Observer.changingInstr(*UseMO.getParent());
467       UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
468       Observer.changedInstr(*UseMO.getParent());
469       return;
470     }
471 
472     Builder.setInsertPt(*InsertIntoBB, InsertBefore);
473     Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
474     MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
475     EmittedInsns[InsertIntoBB] = NewMI;
476     replaceRegOpWith(MRI, UseMO, NewDstReg);
477   };
478 
479   Observer.changingInstr(MI);
480   MI.setDesc(
481       Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
482                                ? TargetOpcode::G_SEXTLOAD
483                                : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
484                                      ? TargetOpcode::G_ZEXTLOAD
485                                      : TargetOpcode::G_LOAD));
486 
487   // Rewrite all the uses to fix up the types.
488   auto &LoadValue = MI.getOperand(0);
489   SmallVector<MachineOperand *, 4> Uses;
490   for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
491     Uses.push_back(&UseMO);
492 
493   for (auto *UseMO : Uses) {
494     MachineInstr *UseMI = UseMO->getParent();
495 
496     // If the extend is compatible with the preferred extend then we should fix
497     // up the type and extend so that it uses the preferred use.
498     if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
499         UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
500       Register UseDstReg = UseMI->getOperand(0).getReg();
501       MachineOperand &UseSrcMO = UseMI->getOperand(1);
502       const LLT &UseDstTy = MRI.getType(UseDstReg);
503       if (UseDstReg != ChosenDstReg) {
504         if (Preferred.Ty == UseDstTy) {
505           // If the use has the same type as the preferred use, then merge
506           // the vregs and erase the extend. For example:
507           //    %1:_(s8) = G_LOAD ...
508           //    %2:_(s32) = G_SEXT %1(s8)
509           //    %3:_(s32) = G_ANYEXT %1(s8)
510           //    ... = ... %3(s32)
511           // rewrites to:
512           //    %2:_(s32) = G_SEXTLOAD ...
513           //    ... = ... %2(s32)
514           replaceRegWith(MRI, UseDstReg, ChosenDstReg);
515           Observer.erasingInstr(*UseMO->getParent());
516           UseMO->getParent()->eraseFromParent();
517         } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
518           // If the preferred size is smaller, then keep the extend but extend
519           // from the result of the extending load. For example:
520           //    %1:_(s8) = G_LOAD ...
521           //    %2:_(s32) = G_SEXT %1(s8)
522           //    %3:_(s64) = G_ANYEXT %1(s8)
523           //    ... = ... %3(s64)
          // rewrites to:
525           //    %2:_(s32) = G_SEXTLOAD ...
526           //    %3:_(s64) = G_ANYEXT %2:_(s32)
527           //    ... = ... %3(s64)
528           replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
529         } else {
530           // If the preferred size is large, then insert a truncate. For
531           // example:
532           //    %1:_(s8) = G_LOAD ...
533           //    %2:_(s64) = G_SEXT %1(s8)
534           //    %3:_(s32) = G_ZEXT %1(s8)
535           //    ... = ... %3(s32)
          // rewrites to:
          //    %2:_(s64) = G_SEXTLOAD ...
          //    %4:_(s8) = G_TRUNC %2:_(s64)
          //    %3:_(s32) = G_ZEXT %4:_(s8)
          //    ... = ... %3(s32)
541           InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
542                                                  InsertTruncAt);
543         }
544         continue;
545       }
546       // The use is (one of) the uses of the preferred use we chose earlier.
547       // We're going to update the load to def this value later so just erase
548       // the old extend.
549       Observer.erasingInstr(*UseMO->getParent());
550       UseMO->getParent()->eraseFromParent();
551       continue;
552     }
553 
554     // The use isn't an extend. Truncate back to the type we originally loaded.
555     // This is free on many targets.
556     InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
557   }
558 
559   MI.getOperand(0).setReg(ChosenDstReg);
560   Observer.changedInstr(MI);
561 }
562 
563 bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
564   assert(DefMI.getParent() == UseMI.getParent());
565   if (&DefMI == &UseMI)
566     return false;
567 
  // Loop through the basic block until we find one of the instructions.
  for (const MachineInstr &CurMI : *DefMI.getParent()) {
    if (&CurMI == &DefMI)
      return true;
    if (&CurMI == &UseMI)
      return false;
  }
  llvm_unreachable("Block must contain both DefMI and UseMI");
574 }
575 
576 bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
577   if (MDT)
578     return MDT->dominates(&DefMI, &UseMI);
579   else if (DefMI.getParent() != UseMI.getParent())
580     return false;
581 
582   return isPredecessor(DefMI, UseMI);
583 }
584 
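// Post-indexing candidate, sketched with made-up registers:
//   %val:_(s32) = G_LOAD %base          ; MI
//   %next:_(p0) = G_PTR_ADD %base, %off
// which applyCombineIndexedLoadStore later turns into
//   %val:_(s32), %next:_(p0) = G_INDEXED_LOAD %base, %off, 0
// i.e. the pointer increment is folded into the memory operation.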
585 bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
586                                             Register &Base, Register &Offset) {
587   auto &MF = *MI.getParent()->getParent();
588   const auto &TLI = *MF.getSubtarget().getTargetLowering();
589 
590 #ifndef NDEBUG
591   unsigned Opcode = MI.getOpcode();
592   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
593          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
594 #endif
595 
596   Base = MI.getOperand(1).getReg();
597   MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
598   if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
599     return false;
600 
601   LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
602 
603   for (auto &Use : MRI.use_instructions(Base)) {
604     if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
605       continue;
606 
607     Offset = Use.getOperand(2).getReg();
608     if (!ForceLegalIndexing &&
609         !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
610       LLVM_DEBUG(dbgs() << "    Ignoring candidate with illegal addrmode: "
611                         << Use);
612       continue;
613     }
614 
615     // Make sure the offset calculation is before the potentially indexed op.
616     // FIXME: we really care about dependency here. The offset calculation might
617     // be movable.
618     MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
619     if (!OffsetDef || !dominates(*OffsetDef, MI)) {
620       LLVM_DEBUG(dbgs() << "    Ignoring candidate with offset after mem-op: "
621                         << Use);
622       continue;
623     }
624 
625     // FIXME: check whether all uses of Base are load/store with foldable
626     // addressing modes. If so, using the normal addr-modes is better than
627     // forming an indexed one.
628 
629     bool MemOpDominatesAddrUses = true;
630     for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
631       if (!dominates(MI, PtrAddUse)) {
632         MemOpDominatesAddrUses = false;
633         break;
634       }
635     }
636 
637     if (!MemOpDominatesAddrUses) {
638       LLVM_DEBUG(
639           dbgs() << "    Ignoring candidate as memop does not dominate uses: "
640                  << Use);
641       continue;
642     }
643 
644     LLVM_DEBUG(dbgs() << "    Found match: " << Use);
645     Addr = Use.getOperand(0).getReg();
646     return true;
647   }
648 
649   return false;
650 }
651 
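// Pre-indexing candidate, sketched with made-up registers:
//   %addr:_(p0) = G_PTR_ADD %base, %off
//   %val:_(s32) = G_LOAD %addr          ; MI
// which applyCombineIndexedLoadStore later turns into
//   %val:_(s32), %addr:_(p0) = G_INDEXED_LOAD %base, %off, 1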
652 bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
653                                            Register &Base, Register &Offset) {
654   auto &MF = *MI.getParent()->getParent();
655   const auto &TLI = *MF.getSubtarget().getTargetLowering();
656 
657 #ifndef NDEBUG
658   unsigned Opcode = MI.getOpcode();
659   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
660          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
661 #endif
662 
663   Addr = MI.getOperand(1).getReg();
664   MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
665   if (!AddrDef || MRI.hasOneUse(Addr))
666     return false;
667 
668   Base = AddrDef->getOperand(1).getReg();
669   Offset = AddrDef->getOperand(2).getReg();
670 
671   LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
672 
673   if (!ForceLegalIndexing &&
674       !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
675     LLVM_DEBUG(dbgs() << "    Skipping, not legal for target");
676     return false;
677   }
678 
679   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
680   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
681     LLVM_DEBUG(dbgs() << "    Skipping, frame index would need copy anyway.");
682     return false;
683   }
684 
685   if (MI.getOpcode() == TargetOpcode::G_STORE) {
686     // Would require a copy.
687     if (Base == MI.getOperand(0).getReg()) {
688       LLVM_DEBUG(dbgs() << "    Skipping, storing base so need copy anyway.");
689       return false;
690     }
691 
692     // We're expecting one use of Addr in MI, but it could also be the
693     // value stored, which isn't actually dominated by the instruction.
694     if (MI.getOperand(0).getReg() == Addr) {
695       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses");
696       return false;
697     }
698   }
699 
700   // FIXME: check whether all uses of the base pointer are constant PtrAdds.
701   // That might allow us to end base's liveness here by adjusting the constant.
702 
703   for (auto &UseMI : MRI.use_instructions(Addr)) {
704     if (!dominates(MI, UseMI)) {
705       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses.");
706       return false;
707     }
708   }
709 
710   return true;
711 }
712 
713 bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
714   IndexedLoadStoreMatchInfo MatchInfo;
715   if (matchCombineIndexedLoadStore(MI, MatchInfo)) {
716     applyCombineIndexedLoadStore(MI, MatchInfo);
717     return true;
718   }
719   return false;
720 }
721 
bool CombinerHelper::matchCombineIndexedLoadStore(
    MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
723   unsigned Opcode = MI.getOpcode();
724   if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
725       Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
726     return false;
727 
728   MatchInfo.IsPre = findPreIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
729                                           MatchInfo.Offset);
730   if (!MatchInfo.IsPre &&
731       !findPostIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
732                               MatchInfo.Offset))
733     return false;
734 
735   return true;
736 }
737 
738 void CombinerHelper::applyCombineIndexedLoadStore(
739     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
740   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
741   MachineIRBuilder MIRBuilder(MI);
742   unsigned Opcode = MI.getOpcode();
743   bool IsStore = Opcode == TargetOpcode::G_STORE;
744   unsigned NewOpcode;
745   switch (Opcode) {
746   case TargetOpcode::G_LOAD:
747     NewOpcode = TargetOpcode::G_INDEXED_LOAD;
748     break;
749   case TargetOpcode::G_SEXTLOAD:
750     NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
751     break;
752   case TargetOpcode::G_ZEXTLOAD:
753     NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
754     break;
755   case TargetOpcode::G_STORE:
756     NewOpcode = TargetOpcode::G_INDEXED_STORE;
757     break;
758   default:
759     llvm_unreachable("Unknown load/store opcode");
760   }
761 
762   auto MIB = MIRBuilder.buildInstr(NewOpcode);
763   if (IsStore) {
764     MIB.addDef(MatchInfo.Addr);
765     MIB.addUse(MI.getOperand(0).getReg());
766   } else {
767     MIB.addDef(MI.getOperand(0).getReg());
768     MIB.addDef(MatchInfo.Addr);
769   }
770 
771   MIB.addUse(MatchInfo.Base);
772   MIB.addUse(MatchInfo.Offset);
773   MIB.addImm(MatchInfo.IsPre);
774   MI.eraseFromParent();
775   AddrDef.eraseFromParent();
776 
  LLVM_DEBUG(dbgs() << "    Combined to indexed operation");
778 }
779 
780 bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
781   if (MI.getOpcode() != TargetOpcode::G_BR)
782     return false;
783 
784   // Try to match the following:
785   // bb1:
786   //   %c(s32) = G_ICMP pred, %a, %b
787   //   %c1(s1) = G_TRUNC %c(s32)
788   //   G_BRCOND %c1, %bb2
789   //   G_BR %bb3
790   // bb2:
791   // ...
792   // bb3:
793 
  // The above pattern does not have a fall through to the successor bb2,
  // always resulting in a branch no matter which path is taken. Here we try
  // to find and replace that pattern with a conditional branch to bb3 and a
  // fallthrough to bb2, which is the layout successor.
798 
799   MachineBasicBlock *MBB = MI.getParent();
800   MachineBasicBlock::iterator BrIt(MI);
801   if (BrIt == MBB->begin())
802     return false;
803   assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
804 
805   MachineInstr *BrCond = &*std::prev(BrIt);
806   if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
807     return false;
808 
809   // Check that the next block is the conditional branch target.
810   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
811     return false;
812 
813   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
814   if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
815       !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
816     return false;
817   return true;
818 }
819 
820 bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
821   if (!matchElideBrByInvertingCond(MI))
822     return false;
823   applyElideBrByInvertingCond(MI);
824   return true;
825 }
826 
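// After this transform the matched block ends with (sketch):
//   %c(s32) = G_ICMP <inverse pred>, %a, %b
//   %c1(s1) = G_TRUNC %c(s32)
//   G_BRCOND %c1, %bb3
// and control falls through to bb2, the G_BRCOND's former target.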
827 void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
828   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
829   MachineBasicBlock::iterator BrIt(MI);
830   MachineInstr *BrCond = &*std::prev(BrIt);
831   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
832 
833   CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
834       (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
835 
836   // Invert the G_ICMP condition.
837   Observer.changingInstr(*CmpMI);
838   CmpMI->getOperand(1).setPredicate(InversePred);
839   Observer.changedInstr(*CmpMI);
840 
841   // Change the conditional branch target.
842   Observer.changingInstr(*BrCond);
843   BrCond->getOperand(1).setMBB(BrTarget);
844   Observer.changedInstr(*BrCond);
845   MI.eraseFromParent();
846 }
847 
848 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
849   // On Darwin, -Os means optimize for size without hurting performance, so
850   // only really optimize for size when -Oz (MinSize) is used.
851   if (MF.getTarget().getTargetTriple().isOSDarwin())
852     return MF.getFunction().hasMinSize();
853   return MF.getFunction().hasOptSize();
854 }
855 
856 // Returns a list of types to use for memory op lowering in MemOps. A partial
857 // port of findOptimalMemOpLowering in TargetLowering.
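//
// For instance, assuming the starting type works out to s64, a 7-byte
// operation typically ends up as {s32, s16, s8}; when unaligned accesses are
// fast and overlap is allowed it becomes two s32 operations where the second
// one overlaps the first by one byte.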
858 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
859                                           unsigned Limit, const MemOp &Op,
860                                           unsigned DstAS, unsigned SrcAS,
861                                           const AttributeList &FuncAttributes,
862                                           const TargetLowering &TLI) {
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
864     return false;
865 
866   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
867 
868   if (Ty == LLT()) {
869     // Use the largest scalar type whose alignment constraints are satisfied.
870     // We only need to check DstAlign here as SrcAlign is always greater or
871     // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
876     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
877     // FIXME: check for the largest legal type we can load/store to.
878   }
879 
880   unsigned NumMemOps = 0;
881   auto Size = Op.size();
882   while (Size != 0) {
883     unsigned TySize = Ty.getSizeInBytes();
884     while (TySize > Size) {
      // For now, only use non-vector loads/stores for the left-over pieces.
886       LLT NewTy = Ty;
887       // FIXME: check for mem op safety and legality of the types. Not all of
888       // SDAGisms map cleanly to GISel concepts.
889       if (NewTy.isVector())
890         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
891       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
892       unsigned NewTySize = NewTy.getSizeInBytes();
893       assert(NewTySize > 0 && "Could not find appropriate type");
894 
895       // If the new LLT cannot cover all of the remaining bits, then consider
896       // issuing a (or a pair of) unaligned and overlapping load / store.
897       bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
899       MVT VT = getMVTForLLT(Ty);
900       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
903           Fast)
904         TySize = Size;
905       else {
906         Ty = NewTy;
907         TySize = NewTySize;
908       }
909     }
910 
911     if (++NumMemOps > Limit)
912       return false;
913 
914     MemOps.push_back(Ty);
915     Size -= TySize;
916   }
917 
918   return true;
919 }
920 
921 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
922   if (Ty.isVector())
923     return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
924                            Ty.getNumElements());
925   return IntegerType::get(C, Ty.getSizeInBits());
926 }
927 
928 // Get a vectorized representation of the memset value operand, GISel edition.
929 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
930   MachineRegisterInfo &MRI = *MIB.getMRI();
931   unsigned NumBits = Ty.getScalarSizeInBits();
932   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
933   if (!Ty.isVector() && ValVRegAndVal) {
934     unsigned KnownVal = ValVRegAndVal->Value;
935     APInt Scalar = APInt(8, KnownVal);
936     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
937     return MIB.buildConstant(Ty, SplatVal).getReg(0);
938   }
939   // FIXME: for vector types create a G_BUILD_VECTOR.
940   if (Ty.isVector())
941     return Register();
942 
943   // Extend the byte value to the larger type, and then multiply by a magic
944   // value 0x010101... in order to replicate it across every byte.
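  // For instance, with an s32 store type and byte value %b this emits
  // (illustrative register names):
  //   %z:_(s32) = G_ZEXT %b(s8)
  //   %m:_(s32) = G_CONSTANT i32 0x01010101
  //   %v:_(s32) = G_MUL %z, %m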
945   LLT ExtType = Ty.getScalarType();
946   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
947   if (NumBits > 8) {
948     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
949     auto MagicMI = MIB.buildConstant(ExtType, Magic);
950     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
951   }
952 
953   assert(ExtType == Ty && "Vector memset value type not supported yet");
954   return Val;
955 }
956 
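// Lowering sketch for the memset below (the actual type list comes from
// findGISelOptimalMemOpLowering and is target-dependent): an 8-byte memset of
// %dst with byte %b on a 64-bit target could become a single store
//   %v:_(s64) = <%b splatted across 8 bytes, see getMemsetValue>
//   G_STORE %v(s64), %dst(p0)
// with further, possibly overlapping, narrower stores for longer lengths.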
957 bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
958                                     Register Val, unsigned KnownLen,
959                                     Align Alignment, bool IsVolatile) {
960   auto &MF = *MI.getParent()->getParent();
961   const auto &TLI = *MF.getSubtarget().getTargetLowering();
962   auto &DL = MF.getDataLayout();
963   LLVMContext &C = MF.getFunction().getContext();
964 
965   assert(KnownLen != 0 && "Have a zero length memset length!");
966 
967   bool DstAlignCanChange = false;
968   MachineFrameInfo &MFI = MF.getFrameInfo();
969   bool OptSize = shouldLowerMemFuncForSize(MF);
970 
971   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
972   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
973     DstAlignCanChange = true;
974 
975   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
976   std::vector<LLT> MemOps;
977 
978   const auto &DstMMO = **MI.memoperands_begin();
979   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
980 
981   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
982   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
983 
984   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
985                                      MemOp::Set(KnownLen, DstAlignCanChange,
986                                                 Alignment,
987                                                 /*IsZeroMemset=*/IsZeroVal,
988                                                 /*IsVolatile=*/IsVolatile),
989                                      DstPtrInfo.getAddrSpace(), ~0u,
990                                      MF.getFunction().getAttributes(), TLI))
991     return false;
992 
993   if (DstAlignCanChange) {
994     // Get an estimate of the type from the LLT.
995     Type *IRTy = getTypeForLLT(MemOps[0], C);
996     Align NewAlign = DL.getABITypeAlign(IRTy);
997     if (NewAlign > Alignment) {
998       Alignment = NewAlign;
999       unsigned FI = FIDef->getOperand(1).getIndex();
1000       // Give the stack frame object a larger alignment if needed.
1001       if (MFI.getObjectAlign(FI) < Alignment)
1002         MFI.setObjectAlignment(FI, Alignment);
1003     }
1004   }
1005 
1006   MachineIRBuilder MIB(MI);
1007   // Find the largest store and generate the bit pattern for it.
1008   LLT LargestTy = MemOps[0];
1009   for (unsigned i = 1; i < MemOps.size(); i++)
1010     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
1011       LargestTy = MemOps[i];
1012 
1013   // The memset stored value is always defined as an s8, so in order to make it
1014   // work with larger store types we need to repeat the bit pattern across the
1015   // wider type.
1016   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
1017 
1018   if (!MemSetValue)
1019     return false;
1020 
1021   // Generate the stores. For each store type in the list, we generate the
1022   // matching store of that type to the destination address.
1023   LLT PtrTy = MRI.getType(Dst);
1024   unsigned DstOff = 0;
1025   unsigned Size = KnownLen;
1026   for (unsigned I = 0; I < MemOps.size(); I++) {
1027     LLT Ty = MemOps[I];
1028     unsigned TySize = Ty.getSizeInBytes();
1029     if (TySize > Size) {
1030       // Issuing an unaligned load / store pair that overlaps with the previous
1031       // pair. Adjust the offset accordingly.
1032       assert(I == MemOps.size() - 1 && I != 0);
1033       DstOff -= TySize - Size;
1034     }
1035 
    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
1038     Register Value = MemSetValue;
1039     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
1040       MVT VT = getMVTForLLT(Ty);
1041       MVT LargestVT = getMVTForLLT(LargestTy);
1042       if (!LargestTy.isVector() && !Ty.isVector() &&
1043           TLI.isTruncateFree(LargestVT, VT))
1044         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
1045       else
1046         Value = getMemsetValue(Val, Ty, MIB);
1047       if (!Value)
1048         return false;
1049     }
1050 
1051     auto *StoreMMO =
1052         MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
1053 
1054     Register Ptr = Dst;
1055     if (DstOff != 0) {
1056       auto Offset =
1057           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
1058       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1059     }
1060 
1061     MIB.buildStore(Value, Ptr, *StoreMMO);
1062     DstOff += Ty.getSizeInBytes();
1063     Size -= TySize;
1064   }
1065 
1066   MI.eraseFromParent();
1067   return true;
1068 }
1069 
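// Inlining sketch for the memcpy below (types again come from
// findGISelOptimalMemOpLowering): a 16-byte copy with suitable alignment could
// become two s64 load/store pairs, where the second pair addresses source and
// destination through G_PTR_ADDs with a constant offset of 8.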
1070 bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
1071                                     Register Src, unsigned KnownLen,
1072                                     Align DstAlign, Align SrcAlign,
1073                                     bool IsVolatile) {
1074   auto &MF = *MI.getParent()->getParent();
1075   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1076   auto &DL = MF.getDataLayout();
1077   LLVMContext &C = MF.getFunction().getContext();
1078 
1079   assert(KnownLen != 0 && "Have a zero length memcpy length!");
1080 
1081   bool DstAlignCanChange = false;
1082   MachineFrameInfo &MFI = MF.getFrameInfo();
1083   bool OptSize = shouldLowerMemFuncForSize(MF);
1084   Align Alignment = commonAlignment(DstAlign, SrcAlign);
1085 
1086   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1087   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1088     DstAlignCanChange = true;
1089 
1090   // FIXME: infer better src pointer alignment like SelectionDAG does here.
1091   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
1092   // if the memcpy is in a tail call position.
1093 
1094   unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
1095   std::vector<LLT> MemOps;
1096 
1097   const auto &DstMMO = **MI.memoperands_begin();
1098   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1099   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1100   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1101 
1102   if (!findGISelOptimalMemOpLowering(
1103           MemOps, Limit,
1104           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
1105                       IsVolatile),
1106           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
1107           MF.getFunction().getAttributes(), TLI))
1108     return false;
1109 
1110   if (DstAlignCanChange) {
1111     // Get an estimate of the type from the LLT.
1112     Type *IRTy = getTypeForLLT(MemOps[0], C);
1113     Align NewAlign = DL.getABITypeAlign(IRTy);
1114 
1115     // Don't promote to an alignment that would require dynamic stack
1116     // realignment.
1117     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1118     if (!TRI->needsStackRealignment(MF))
1119       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
1120         NewAlign = NewAlign / 2;
1121 
1122     if (NewAlign > Alignment) {
1123       Alignment = NewAlign;
1124       unsigned FI = FIDef->getOperand(1).getIndex();
1125       // Give the stack frame object a larger alignment if needed.
1126       if (MFI.getObjectAlign(FI) < Alignment)
1127         MFI.setObjectAlignment(FI, Alignment);
1128     }
1129   }
1130 
1131   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
1132 
1133   MachineIRBuilder MIB(MI);
  // Now we need to emit a load/store pair for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest
  // buffer of that value loaded. This can result in a sequence of loads and
  // stores of mixed types, depending on what the target specifies as good
  // types to use.
1139   unsigned CurrOffset = 0;
1140   LLT PtrTy = MRI.getType(Src);
1141   unsigned Size = KnownLen;
1142   for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
1145     if (CopyTy.getSizeInBytes() > Size)
1146       CurrOffset -= CopyTy.getSizeInBytes() - Size;
1147 
1148     // Construct MMOs for the accesses.
1149     auto *LoadMMO =
1150         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1151     auto *StoreMMO =
1152         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1153 
1154     // Create the load.
1155     Register LoadPtr = Src;
1156     Register Offset;
1157     if (CurrOffset != 0) {
1158       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
1159                    .getReg(0);
1160       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1161     }
1162     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
1163 
1164     // Create the store.
1165     Register StorePtr =
1166         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1167     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
1168     CurrOffset += CopyTy.getSizeInBytes();
1169     Size -= CopyTy.getSizeInBytes();
1170   }
1171 
1172   MI.eraseFromParent();
1173   return true;
1174 }
1175 
1176 bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
1177                                      Register Src, unsigned KnownLen,
1178                                      Align DstAlign, Align SrcAlign,
1179                                      bool IsVolatile) {
1180   auto &MF = *MI.getParent()->getParent();
1181   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1182   auto &DL = MF.getDataLayout();
1183   LLVMContext &C = MF.getFunction().getContext();
1184 
1185   assert(KnownLen != 0 && "Have a zero length memmove length!");
1186 
1187   bool DstAlignCanChange = false;
1188   MachineFrameInfo &MFI = MF.getFrameInfo();
1189   bool OptSize = shouldLowerMemFuncForSize(MF);
1190   Align Alignment = commonAlignment(DstAlign, SrcAlign);
1191 
1192   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1193   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1194     DstAlignCanChange = true;
1195 
1196   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
1197   std::vector<LLT> MemOps;
1198 
1199   const auto &DstMMO = **MI.memoperands_begin();
1200   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1201   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1202   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1203 
  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
1207   if (!findGISelOptimalMemOpLowering(
1208           MemOps, Limit,
1209           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
1210                       /*IsVolatile*/ true),
1211           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
1212           MF.getFunction().getAttributes(), TLI))
1213     return false;
1214 
1215   if (DstAlignCanChange) {
1216     // Get an estimate of the type from the LLT.
1217     Type *IRTy = getTypeForLLT(MemOps[0], C);
1218     Align NewAlign = DL.getABITypeAlign(IRTy);
1219 
1220     // Don't promote to an alignment that would require dynamic stack
1221     // realignment.
1222     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1223     if (!TRI->needsStackRealignment(MF))
1224       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
1225         NewAlign = NewAlign / 2;
1226 
1227     if (NewAlign > Alignment) {
1228       Alignment = NewAlign;
1229       unsigned FI = FIDef->getOperand(1).getIndex();
1230       // Give the stack frame object a larger alignment if needed.
1231       if (MFI.getObjectAlign(FI) < Alignment)
1232         MFI.setObjectAlignment(FI, Alignment);
1233     }
1234   }
1235 
1236   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
1237 
1238   MachineIRBuilder MIB(MI);
1239   // Memmove requires that we perform the loads first before issuing the stores.
1240   // Apart from that, this loop is pretty much doing the same thing as the
1241   // memcpy codegen function.
1242   unsigned CurrOffset = 0;
1243   LLT PtrTy = MRI.getType(Src);
1244   SmallVector<Register, 16> LoadVals;
1245   for (auto CopyTy : MemOps) {
1246     // Construct MMO for the load.
1247     auto *LoadMMO =
1248         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1249 
1250     // Create the load.
1251     Register LoadPtr = Src;
1252     if (CurrOffset != 0) {
1253       auto Offset =
1254           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1255       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1256     }
1257     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
1258     CurrOffset += CopyTy.getSizeInBytes();
1259   }
1260 
1261   CurrOffset = 0;
1262   for (unsigned I = 0; I < MemOps.size(); ++I) {
1263     LLT CopyTy = MemOps[I];
1264     // Now store the values loaded.
1265     auto *StoreMMO =
1266         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1267 
1268     Register StorePtr = Dst;
1269     if (CurrOffset != 0) {
1270       auto Offset =
1271           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1272       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1273     }
1274     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
1275     CurrOffset += CopyTy.getSizeInBytes();
1276   }
1277   MI.eraseFromParent();
1278   return true;
1279 }
1280 
1281 bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
1282   // This combine is fairly complex so it's not written with a separate
1283   // matcher function.
1284   assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1285   Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
1286   assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
1287           ID == Intrinsic::memset) &&
1288          "Expected a memcpy like intrinsic");
1289 
1290   auto MMOIt = MI.memoperands_begin();
1291   const MachineMemOperand *MemOp = *MMOIt;
1292   bool IsVolatile = MemOp->isVolatile();
1293   // Don't try to optimize volatile.
1294   if (IsVolatile)
1295     return false;
1296 
1297   Align DstAlign(MemOp->getBaseAlignment());
1298   Align SrcAlign;
1299   Register Dst = MI.getOperand(1).getReg();
1300   Register Src = MI.getOperand(2).getReg();
1301   Register Len = MI.getOperand(3).getReg();
1302 
1303   if (ID != Intrinsic::memset) {
1304     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
1305     MemOp = *(++MMOIt);
1306     SrcAlign = Align(MemOp->getBaseAlignment());
1307   }
1308 
1309   // See if this is a constant length copy
1310   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
1311   if (!LenVRegAndVal)
1312     return false; // Leave it to the legalizer to lower it to a libcall.
1313   unsigned KnownLen = LenVRegAndVal->Value;
1314 
1315   if (KnownLen == 0) {
1316     MI.eraseFromParent();
1317     return true;
1318   }
1319 
1320   if (MaxLen && KnownLen > MaxLen)
1321     return false;
1322 
  if (ID == Intrinsic::memcpy)
    return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                          IsVolatile);
  if (ID == Intrinsic::memmove)
    return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
1327   if (ID == Intrinsic::memset)
1328     return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
1329   return false;
1330 }
1331 
1332 bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
1333                                            PtrAddChain &MatchInfo) {
1334   // We're trying to match the following pattern:
1335   //   %t1 = G_PTR_ADD %base, G_CONSTANT imm1
1336   //   %root = G_PTR_ADD %t1, G_CONSTANT imm2
1337   // -->
1338   //   %root = G_PTR_ADD %base, G_CONSTANT (imm1 + imm2)
1339 
1340   if (MI.getOpcode() != TargetOpcode::G_PTR_ADD)
1341     return false;
1342 
1343   Register Add2 = MI.getOperand(1).getReg();
1344   Register Imm1 = MI.getOperand(2).getReg();
1345   auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
1346   if (!MaybeImmVal)
1347     return false;
1348 
1349   MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
1350   if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
1351     return false;
1352 
1353   Register Base = Add2Def->getOperand(1).getReg();
1354   Register Imm2 = Add2Def->getOperand(2).getReg();
1355   auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
1356   if (!MaybeImm2Val)
1357     return false;
1358 
1359   // Pass the combined immediate to the apply function.
1360   MatchInfo.Imm = MaybeImmVal->Value + MaybeImm2Val->Value;
1361   MatchInfo.Base = Base;
1362   return true;
1363 }
1364 
1365 bool CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
1366                                            PtrAddChain &MatchInfo) {
1367   assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
1368   MachineIRBuilder MIB(MI);
1369   LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
1370   auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
1371   Observer.changingInstr(MI);
1372   MI.getOperand(1).setReg(MatchInfo.Base);
1373   MI.getOperand(2).setReg(NewOffset.getReg(0));
1374   Observer.changedInstr(MI);
1375   return true;
1376 }
1377 
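// Example (illustrative): a multiplication by a power-of-two constant
//   %d:_(s64) = G_MUL %x, %c   ; %c = G_CONSTANT i64 16
// becomes
//   %d:_(s64) = G_SHL %x, %s   ; %s = G_CONSTANT i64 4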
1378 bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
1379                                           unsigned &ShiftVal) {
1380   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
1381   auto MaybeImmVal =
1382       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
1383   if (!MaybeImmVal || !isPowerOf2_64(MaybeImmVal->Value))
1384     return false;
1385   ShiftVal = Log2_64(MaybeImmVal->Value);
1386   return true;
1387 }
1388 
1389 bool CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
1390                                           unsigned &ShiftVal) {
1391   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
1392   MachineIRBuilder MIB(MI);
1393   LLT ShiftTy = MRI.getType(MI.getOperand(0).getReg());
1394   auto ShiftCst = MIB.buildConstant(ShiftTy, ShiftVal);
1395   Observer.changingInstr(MI);
1396   MI.setDesc(MIB.getTII().get(TargetOpcode::G_SHL));
1397   MI.getOperand(2).setReg(ShiftCst.getReg(0));
1398   Observer.changedInstr(MI);
1399   return true;
1400 }
1401 
1402 bool CombinerHelper::tryCombine(MachineInstr &MI) {
1403   if (tryCombineCopy(MI))
1404     return true;
1405   if (tryCombineExtendingLoads(MI))
1406     return true;
1407   if (tryCombineIndexedLoadStore(MI))
1408     return true;
1409   return false;
1410 }
1411