//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
9 #include "llvm/CodeGen/GlobalISel/Combiner.h"
10 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
11 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
13 #include "llvm/CodeGen/GlobalISel/Utils.h"
14 #include "llvm/CodeGen/MachineDominators.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineInstr.h"
17 #include "llvm/CodeGen/MachineRegisterInfo.h"
18 #include "llvm/CodeGen/TargetInstrInfo.h"
19 #include "llvm/CodeGen/TargetLowering.h"
20 #include "llvm/Support/MathExtras.h"
21 #include "llvm/Target/TargetMachine.h"
22 
23 #define DEBUG_TYPE "gi-combiner"
24 
25 using namespace llvm;
26 
27 // Option to allow testing of the combiner while no targets know about indexed
28 // addressing.
29 static cl::opt<bool>
30     ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
31                        cl::desc("Force all indexed operations to be "
32                                 "legal for the GlobalISel combiner"));
33 
34 
35 CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
36                                MachineIRBuilder &B, GISelKnownBits *KB,
37                                MachineDominatorTree *MDT)
38     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
39       KB(KB), MDT(MDT) {
40   (void)this->KB;
41 }
42 
43 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
44                                     Register ToReg) const {
45   Observer.changingAllUsesOfReg(MRI, FromReg);
46 
47   if (MRI.constrainRegAttrs(ToReg, FromReg))
48     MRI.replaceRegWith(FromReg, ToReg);
49   else
50     Builder.buildCopy(ToReg, FromReg);
51 
52   Observer.finishedChangingAllUsesOfReg();
53 }
54 
55 void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
56                                       MachineOperand &FromRegOp,
57                                       Register ToReg) const {
58   assert(FromRegOp.getParent() && "Expected an operand in an MI");
59   Observer.changingInstr(*FromRegOp.getParent());
60 
61   FromRegOp.setReg(ToReg);
62 
63   Observer.changedInstr(*FromRegOp.getParent());
64 }
65 
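// Example of the copy combine below (illustrative MIR, register names made
// up):
//   %1:_(s32) = COPY %0:_(s32)
//   %2:_(s32) = G_ADD %1, %1
// becomes, once the COPY is erased and %1 is replaced with %0:
//   %2:_(s32) = G_ADD %0, %0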
66 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
67   if (matchCombineCopy(MI)) {
68     applyCombineCopy(MI);
69     return true;
70   }
71   return false;
72 }
73 bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
74   if (MI.getOpcode() != TargetOpcode::COPY)
75     return false;
76   Register DstReg = MI.getOperand(0).getReg();
77   Register SrcReg = MI.getOperand(1).getReg();
78 
  // Give up if either DstReg or SrcReg is a physical register.
80   if (Register::isPhysicalRegister(DstReg) ||
81       Register::isPhysicalRegister(SrcReg))
82     return false;
83 
  // Give up if the types don't match.
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  // Give up if one has a valid LLT, but the other doesn't.
  if (DstTy.isValid() != SrcTy.isValid())
    return false;
  // Give up if both have valid LLTs, but they differ.
  if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
    return false;
93 
94   // Get the register banks and classes.
95   const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
96   const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
97   const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
98   const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
99 
100   // Replace if the register constraints match.
101   if ((SrcRC == DstRC) && (SrcBank == DstBank))
102     return true;
103   // Replace if DstReg has no constraints.
104   if (!DstBank && !DstRC)
105     return true;
106 
107   return false;
108 }
109 void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
110   Register DstReg = MI.getOperand(0).getReg();
111   Register SrcReg = MI.getOperand(1).getReg();
112   MI.eraseFromParent();
113   replaceRegWith(MRI, DstReg, SrcReg);
114 }
115 
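// Example of the concat_vectors combine below (illustrative MIR):
//   %a:_(<2 x s32>) = G_BUILD_VECTOR %x, %y
//   %b:_(<2 x s32>) = G_BUILD_VECTOR %z, %w
//   %dst:_(<4 x s32>) = G_CONCAT_VECTORS %a, %b
// is flattened into
//   %dst:_(<4 x s32>) = G_BUILD_VECTOR %x, %y, %z, %w
// (with G_IMPLICIT_DEF operands contributing undef scalar elements instead).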
116 bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
117   bool IsUndef = false;
118   SmallVector<Register, 4> Ops;
119   if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
120     applyCombineConcatVectors(MI, IsUndef, Ops);
121     return true;
122   }
123   return false;
124 }
125 
126 bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
127                                                SmallVectorImpl<Register> &Ops) {
128   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
129          "Invalid instruction");
130   IsUndef = true;
131   MachineInstr *Undef = nullptr;
132 
133   // Walk over all the operands of concat vectors and check if they are
134   // build_vector themselves or undef.
135   // Then collect their operands in Ops.
136   for (const MachineOperand &MO : MI.uses()) {
137     Register Reg = MO.getReg();
138     MachineInstr *Def = MRI.getVRegDef(Reg);
139     assert(Def && "Operand not defined");
140     switch (Def->getOpcode()) {
141     case TargetOpcode::G_BUILD_VECTOR:
142       IsUndef = false;
143       // Remember the operands of the build_vector to fold
144       // them into the yet-to-build flattened concat vectors.
145       for (const MachineOperand &BuildVecMO : Def->uses())
146         Ops.push_back(BuildVecMO.getReg());
147       break;
148     case TargetOpcode::G_IMPLICIT_DEF: {
149       LLT OpType = MRI.getType(Reg);
150       // Keep one undef value for all the undef operands.
151       if (!Undef) {
152         Builder.setInsertPt(*MI.getParent(), MI);
153         Undef = Builder.buildUndef(OpType.getScalarType());
154       }
155       assert(MRI.getType(Undef->getOperand(0).getReg()) ==
156                  OpType.getScalarType() &&
157              "All undefs should have the same type");
158       // Break the undef vector in as many scalar elements as needed
159       // for the flattening.
160       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
161            EltIdx != EltEnd; ++EltIdx)
162         Ops.push_back(Undef->getOperand(0).getReg());
163       break;
164     }
165     default:
166       return false;
167     }
168   }
169   return true;
170 }
171 void CombinerHelper::applyCombineConcatVectors(
172     MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
  // We determined that the concat_vectors can be flattened.
174   // Generate the flattened build_vector.
175   Register DstReg = MI.getOperand(0).getReg();
176   Builder.setInsertPt(*MI.getParent(), MI);
177   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
178 
  // Note: IsUndef is sort of redundant. We could have determined it by
  // checking that all of Ops are undef.  Alternatively, we could have
  // generated a build_vector of undefs and relied on another combine to
  // clean that up.  For now, given that we already gather this information
  // in matchCombineConcatVectors, just save compile time and issue the
  // right thing.
185   if (IsUndef)
186     Builder.buildUndef(NewDstReg);
187   else
188     Builder.buildBuildVector(NewDstReg, Ops);
189   MI.eraseFromParent();
190   replaceRegWith(MRI, DstReg, NewDstReg);
191 }
192 
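// Example of the shuffle_vector combine below (illustrative MIR): a shuffle
// whose mask simply concatenates its inputs, such as
//   %dst:_(<4 x s32>) = G_SHUFFLE_VECTOR %a(<2 x s32>), %b,
//                                         shufflemask(0, 1, 2, 3)
// is rewritten into the equivalent concatenation of %a and %b (built via
// buildMerge, or a plain COPY when only a single source is needed).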
193 bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
194   SmallVector<Register, 4> Ops;
195   if (matchCombineShuffleVector(MI, Ops)) {
196     applyCombineShuffleVector(MI, Ops);
197     return true;
198   }
199   return false;
200 }
201 
202 bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
203                                                SmallVectorImpl<Register> &Ops) {
204   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
205          "Invalid instruction kind");
206   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
207   Register Src1 = MI.getOperand(1).getReg();
208   LLT SrcType = MRI.getType(Src1);
  // As bizarre as it may look, shuffle vector can actually produce a
  // scalar! This is because at the IR level a <1 x ty> shuffle
  // vector is perfectly valid.
212   unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
213   unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
214 
  // If the resulting vector is smaller than the size of the source
  // vectors being concatenated, we won't be able to replace the
  // shuffle vector with a concat_vectors.
218   //
219   // Note: We may still be able to produce a concat_vectors fed by
220   //       extract_vector_elt and so on. It is less clear that would
221   //       be better though, so don't bother for now.
222   //
  // If the destination is a scalar, the size of the sources doesn't
  // matter. We will lower the shuffle to a plain copy. This only
  // works if the source and destination have the same size, but
  // that's covered by the next condition.
227   //
  // TODO: If the sizes of the source and destination don't match,
  //       we could still emit an extract vector element in that case.
230   if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
231     return false;
232 
233   // Check that the shuffle mask can be broken evenly between the
234   // different sources.
235   if (DstNumElts % SrcNumElts != 0)
236     return false;
237 
238   // Mask length is a multiple of the source vector length.
239   // Check if the shuffle is some kind of concatenation of the input
240   // vectors.
241   unsigned NumConcat = DstNumElts / SrcNumElts;
242   SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
243   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
244   for (unsigned i = 0; i != DstNumElts; ++i) {
245     int Idx = Mask[i];
246     // Undef value.
247     if (Idx < 0)
248       continue;
249     // Ensure the indices in each SrcType sized piece are sequential and that
250     // the same source is used for the whole piece.
251     if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
252         (ConcatSrcs[i / SrcNumElts] >= 0 &&
253          ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
254       return false;
255     // Remember which source this index came from.
256     ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
257   }
258 
259   // The shuffle is concatenating multiple vectors together.
260   // Collect the different operands for that.
261   Register UndefReg;
262   Register Src2 = MI.getOperand(2).getReg();
263   for (auto Src : ConcatSrcs) {
264     if (Src < 0) {
265       if (!UndefReg) {
266         Builder.setInsertPt(*MI.getParent(), MI);
267         UndefReg = Builder.buildUndef(SrcType).getReg(0);
268       }
269       Ops.push_back(UndefReg);
270     } else if (Src == 0)
271       Ops.push_back(Src1);
272     else
273       Ops.push_back(Src2);
274   }
275   return true;
276 }
277 
278 void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
279                                                const ArrayRef<Register> Ops) {
280   Register DstReg = MI.getOperand(0).getReg();
281   Builder.setInsertPt(*MI.getParent(), MI);
282   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
283 
284   if (Ops.size() == 1)
285     Builder.buildCopy(NewDstReg, Ops[0]);
286   else
287     Builder.buildMerge(NewDstReg, Ops);
288 
289   MI.eraseFromParent();
290   replaceRegWith(MRI, DstReg, NewDstReg);
291 }
292 
293 namespace {
294 
295 /// Select a preference between two uses. CurrentUse is the current preference
/// while the *ForCandidate arguments describe the candidate under
/// consideration.
297 PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
298                                   const LLT &TyForCandidate,
299                                   unsigned OpcodeForCandidate,
300                                   MachineInstr *MIForCandidate) {
301   if (!CurrentUse.Ty.isValid()) {
302     if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
303         CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
304       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
305     return CurrentUse;
306   }
307 
308   // We permit the extend to hoist through basic blocks but this is only
309   // sensible if the target has extending loads. If you end up lowering back
310   // into a load and extend during the legalizer then the end result is
311   // hoisting the extend up to the load.
312 
313   // Prefer defined extensions to undefined extensions as these are more
314   // likely to reduce the number of instructions.
315   if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
316       CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
317     return CurrentUse;
318   else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
319            OpcodeForCandidate != TargetOpcode::G_ANYEXT)
320     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
321 
  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so folding them into the load is the bigger win.
324   if (CurrentUse.Ty == TyForCandidate) {
325     if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
326         OpcodeForCandidate == TargetOpcode::G_ZEXT)
327       return CurrentUse;
328     else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
329              OpcodeForCandidate == TargetOpcode::G_SEXT)
330       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
331   }
332 
333   // This is potentially target specific. We've chosen the largest type
334   // because G_TRUNC is usually free. One potential catch with this is that
335   // some targets have a reduced number of larger registers than smaller
336   // registers and this choice potentially increases the live-range for the
337   // larger value.
338   if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
339     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
340   }
341   return CurrentUse;
342 }
343 
344 /// Find a suitable place to insert some instructions and insert them. This
345 /// function accounts for special cases like inserting before a PHI node.
346 /// The current strategy for inserting before PHI's is to duplicate the
347 /// instructions for each predecessor. However, while that's ok for G_TRUNC
348 /// on most targets since it generally requires no code, other targets/cases may
349 /// want to try harder to find a dominating block.
350 static void InsertInsnsWithoutSideEffectsBeforeUse(
351     MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
352     std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
353                        MachineOperand &UseMO)>
354         Inserter) {
355   MachineInstr &UseMI = *UseMO.getParent();
356 
357   MachineBasicBlock *InsertBB = UseMI.getParent();
358 
359   // If the use is a PHI then we want the predecessor block instead.
360   if (UseMI.isPHI()) {
361     MachineOperand *PredBB = std::next(&UseMO);
362     InsertBB = PredBB->getMBB();
363   }
364 
365   // If the block is the same block as the def then we want to insert just after
366   // the def instead of at the start of the block.
367   if (InsertBB == DefMI.getParent()) {
368     MachineBasicBlock::iterator InsertPt = &DefMI;
369     Inserter(InsertBB, std::next(InsertPt), UseMO);
370     return;
371   }
372 
373   // Otherwise we want the start of the BB
374   Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
375 }
376 } // end anonymous namespace
377 
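// Example of the extending-loads combine below (illustrative MIR):
//   %v:_(s16) = G_LOAD %ptr :: (load 2)
//   %e:_(s32) = G_SEXT %v(s16)
// becomes
//   %e:_(s32) = G_SEXTLOAD %ptr :: (load 2)
// with any remaining uses of %v rewritten through a G_TRUNC of %e.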
378 bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
379   PreferredTuple Preferred;
380   if (matchCombineExtendingLoads(MI, Preferred)) {
381     applyCombineExtendingLoads(MI, Preferred);
382     return true;
383   }
384   return false;
385 }
386 
387 bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
388                                                 PreferredTuple &Preferred) {
389   // We match the loads and follow the uses to the extend instead of matching
390   // the extends and following the def to the load. This is because the load
391   // must remain in the same position for correctness (unless we also add code
392   // to find a safe place to sink it) whereas the extend is freely movable.
  // It also prevents us from duplicating the load, which would be incorrect
  // for volatile accesses and generally undesirable for performance.
395 
396   if (MI.getOpcode() != TargetOpcode::G_LOAD &&
397       MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
398       MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
399     return false;
400 
401   auto &LoadValue = MI.getOperand(0);
402   assert(LoadValue.isReg() && "Result wasn't a register?");
403 
404   LLT LoadValueTy = MRI.getType(LoadValue.getReg());
405   if (!LoadValueTy.isScalar())
406     return false;
407 
  // Most architectures are going to legalize sub-s8 loads into at least a 1
  // byte load, and the MMOs can only describe memory accesses in multiples of
  // bytes.
410   // If we try to perform extload combining on those, we can end up with
411   // %a(s8) = extload %ptr (load 1 byte from %ptr)
412   // ... which is an illegal extload instruction.
413   if (LoadValueTy.getSizeInBits() < 8)
414     return false;
415 
416   // For non power-of-2 types, they will very likely be legalized into multiple
417   // loads. Don't bother trying to match them into extending loads.
418   if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
419     return false;
420 
  // Find the preferred type aside from the any-extends (unless it's the only
  // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
  // relative type sizes. At the same time, pick an extend to use based on the
  // extend involved in the chosen type.
426   unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
427                                  ? TargetOpcode::G_ANYEXT
428                                  : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
429                                        ? TargetOpcode::G_SEXT
430                                        : TargetOpcode::G_ZEXT;
431   Preferred = {LLT(), PreferredOpcode, nullptr};
432   for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
433     if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
434         UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
435         UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
436       Preferred = ChoosePreferredUse(Preferred,
437                                      MRI.getType(UseMI.getOperand(0).getReg()),
438                                      UseMI.getOpcode(), &UseMI);
439     }
440   }
441 
  // There were no extends.
443   if (!Preferred.MI)
444     return false;
  // It should be impossible to choose an extend without selecting a different
  // type since by definition the result of an extend is larger.
447   assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
448 
449   LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
450   return true;
451 }
452 
453 void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
454                                                 PreferredTuple &Preferred) {
455   // Rewrite the load to the chosen extending load.
456   Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
457 
458   // Inserter to insert a truncate back to the original type at a given point
459   // with some basic CSE to limit truncate duplication to one per BB.
460   DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
461   auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
462                            MachineBasicBlock::iterator InsertBefore,
463                            MachineOperand &UseMO) {
464     MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
465     if (PreviouslyEmitted) {
466       Observer.changingInstr(*UseMO.getParent());
467       UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
468       Observer.changedInstr(*UseMO.getParent());
469       return;
470     }
471 
472     Builder.setInsertPt(*InsertIntoBB, InsertBefore);
473     Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
474     MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
475     EmittedInsns[InsertIntoBB] = NewMI;
476     replaceRegOpWith(MRI, UseMO, NewDstReg);
477   };
478 
479   Observer.changingInstr(MI);
480   MI.setDesc(
481       Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
482                                ? TargetOpcode::G_SEXTLOAD
483                                : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
484                                      ? TargetOpcode::G_ZEXTLOAD
485                                      : TargetOpcode::G_LOAD));
486 
487   // Rewrite all the uses to fix up the types.
488   auto &LoadValue = MI.getOperand(0);
489   SmallVector<MachineOperand *, 4> Uses;
490   for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
491     Uses.push_back(&UseMO);
492 
493   for (auto *UseMO : Uses) {
494     MachineInstr *UseMI = UseMO->getParent();
495 
496     // If the extend is compatible with the preferred extend then we should fix
497     // up the type and extend so that it uses the preferred use.
498     if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
499         UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
500       Register UseDstReg = UseMI->getOperand(0).getReg();
501       MachineOperand &UseSrcMO = UseMI->getOperand(1);
502       const LLT &UseDstTy = MRI.getType(UseDstReg);
503       if (UseDstReg != ChosenDstReg) {
504         if (Preferred.Ty == UseDstTy) {
505           // If the use has the same type as the preferred use, then merge
506           // the vregs and erase the extend. For example:
507           //    %1:_(s8) = G_LOAD ...
508           //    %2:_(s32) = G_SEXT %1(s8)
509           //    %3:_(s32) = G_ANYEXT %1(s8)
510           //    ... = ... %3(s32)
511           // rewrites to:
512           //    %2:_(s32) = G_SEXTLOAD ...
513           //    ... = ... %2(s32)
514           replaceRegWith(MRI, UseDstReg, ChosenDstReg);
515           Observer.erasingInstr(*UseMO->getParent());
516           UseMO->getParent()->eraseFromParent();
517         } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
518           // If the preferred size is smaller, then keep the extend but extend
519           // from the result of the extending load. For example:
520           //    %1:_(s8) = G_LOAD ...
521           //    %2:_(s32) = G_SEXT %1(s8)
522           //    %3:_(s64) = G_ANYEXT %1(s8)
523           //    ... = ... %3(s64)
          // rewrites to:
525           //    %2:_(s32) = G_SEXTLOAD ...
526           //    %3:_(s64) = G_ANYEXT %2:_(s32)
527           //    ... = ... %3(s64)
528           replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
529         } else {
530           // If the preferred size is large, then insert a truncate. For
531           // example:
532           //    %1:_(s8) = G_LOAD ...
533           //    %2:_(s64) = G_SEXT %1(s8)
534           //    %3:_(s32) = G_ZEXT %1(s8)
535           //    ... = ... %3(s32)
          // rewrites to:
          //    %2:_(s64) = G_SEXTLOAD ...
          //    %4:_(s8) = G_TRUNC %2:_(s64)
          //    %3:_(s32) = G_ZEXT %4:_(s8)
          //    ... = ... %3(s32)
541           InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
542                                                  InsertTruncAt);
543         }
544         continue;
545       }
546       // The use is (one of) the uses of the preferred use we chose earlier.
547       // We're going to update the load to def this value later so just erase
548       // the old extend.
549       Observer.erasingInstr(*UseMO->getParent());
550       UseMO->getParent()->eraseFromParent();
551       continue;
552     }
553 
554     // The use isn't an extend. Truncate back to the type we originally loaded.
555     // This is free on many targets.
556     InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
557   }
558 
559   MI.getOperand(0).setReg(ChosenDstReg);
560   Observer.changedInstr(MI);
561 }
562 
563 bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
564   assert(DefMI.getParent() == UseMI.getParent());
565   if (&DefMI == &UseMI)
566     return false;
567 
  // Loop through the basic block until we find one of the instructions.
  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
  while (&*I != &DefMI && &*I != &UseMI)
    ++I;
  // DefMI is a predecessor of UseMI if we reached DefMI first.
  return &*I == &DefMI;
574 }
575 
576 bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
577   if (MDT)
578     return MDT->dominates(&DefMI, &UseMI);
579   else if (DefMI.getParent() != UseMI.getParent())
580     return false;
581 
582   return isPredecessor(DefMI, UseMI);
583 }
584 
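// Example of a post-indexing opportunity (illustrative MIR): the G_PTR_ADD
// that produces the next address can be folded into the memory operation, e.g.
//   %v:_(s32) = G_LOAD %base
//   %next:_(p0) = G_PTR_ADD %base, %offset
// can become (subject to the legality/dominance checks below)
//   %v:_(s32), %next:_(p0) = G_INDEXED_LOAD %base, %offset, 0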
585 bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
586                                             Register &Base, Register &Offset) {
587   auto &MF = *MI.getParent()->getParent();
588   const auto &TLI = *MF.getSubtarget().getTargetLowering();
589 
590 #ifndef NDEBUG
591   unsigned Opcode = MI.getOpcode();
592   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
593          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
594 #endif
595 
596   Base = MI.getOperand(1).getReg();
597   MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
598   if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
599     return false;
600 
601   LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
602 
603   for (auto &Use : MRI.use_instructions(Base)) {
604     if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
605       continue;
606 
607     Offset = Use.getOperand(2).getReg();
608     if (!ForceLegalIndexing &&
609         !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
610       LLVM_DEBUG(dbgs() << "    Ignoring candidate with illegal addrmode: "
611                         << Use);
612       continue;
613     }
614 
615     // Make sure the offset calculation is before the potentially indexed op.
616     // FIXME: we really care about dependency here. The offset calculation might
617     // be movable.
618     MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
619     if (!OffsetDef || !dominates(*OffsetDef, MI)) {
620       LLVM_DEBUG(dbgs() << "    Ignoring candidate with offset after mem-op: "
621                         << Use);
622       continue;
623     }
624 
625     // FIXME: check whether all uses of Base are load/store with foldable
626     // addressing modes. If so, using the normal addr-modes is better than
627     // forming an indexed one.
628 
629     bool MemOpDominatesAddrUses = true;
630     for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
631       if (!dominates(MI, PtrAddUse)) {
632         MemOpDominatesAddrUses = false;
633         break;
634       }
635     }
636 
637     if (!MemOpDominatesAddrUses) {
638       LLVM_DEBUG(
639           dbgs() << "    Ignoring candidate as memop does not dominate uses: "
640                  << Use);
641       continue;
642     }
643 
644     LLVM_DEBUG(dbgs() << "    Found match: " << Use);
645     Addr = Use.getOperand(0).getReg();
646     return true;
647   }
648 
649   return false;
650 }
651 
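// Example of a pre-indexing opportunity (illustrative MIR): the memory
// operation's address is itself a G_PTR_ADD with other users, e.g.
//   %addr:_(p0) = G_PTR_ADD %base, %offset
//   G_STORE %val(s32), %addr
// can become (subject to the checks below)
//   %addr:_(p0) = G_INDEXED_STORE %val, %base, %offset, 1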
652 bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
653                                            Register &Base, Register &Offset) {
654   auto &MF = *MI.getParent()->getParent();
655   const auto &TLI = *MF.getSubtarget().getTargetLowering();
656 
657 #ifndef NDEBUG
658   unsigned Opcode = MI.getOpcode();
659   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
660          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
661 #endif
662 
663   Addr = MI.getOperand(1).getReg();
664   MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
665   if (!AddrDef || MRI.hasOneUse(Addr))
666     return false;
667 
668   Base = AddrDef->getOperand(1).getReg();
669   Offset = AddrDef->getOperand(2).getReg();
670 
671   LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
672 
673   if (!ForceLegalIndexing &&
674       !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
675     LLVM_DEBUG(dbgs() << "    Skipping, not legal for target");
676     return false;
677   }
678 
679   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
680   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
681     LLVM_DEBUG(dbgs() << "    Skipping, frame index would need copy anyway.");
682     return false;
683   }
684 
685   if (MI.getOpcode() == TargetOpcode::G_STORE) {
686     // Would require a copy.
687     if (Base == MI.getOperand(0).getReg()) {
688       LLVM_DEBUG(dbgs() << "    Skipping, storing base so need copy anyway.");
689       return false;
690     }
691 
692     // We're expecting one use of Addr in MI, but it could also be the
693     // value stored, which isn't actually dominated by the instruction.
694     if (MI.getOperand(0).getReg() == Addr) {
695       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses");
696       return false;
697     }
698   }
699 
700   // FIXME: check whether all uses of the base pointer are constant PtrAdds.
701   // That might allow us to end base's liveness here by adjusting the constant.
702 
703   for (auto &UseMI : MRI.use_instructions(Addr)) {
704     if (!dominates(MI, UseMI)) {
705       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses.");
706       return false;
707     }
708   }
709 
710   return true;
711 }
712 
713 bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
714   IndexedLoadStoreMatchInfo MatchInfo;
715   if (matchCombineIndexedLoadStore(MI, MatchInfo)) {
716     applyCombineIndexedLoadStore(MI, MatchInfo);
717     return true;
718   }
719   return false;
720 }
721 
722 bool CombinerHelper::matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
723   unsigned Opcode = MI.getOpcode();
724   if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
725       Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
726     return false;
727 
728   MatchInfo.IsPre = findPreIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
729                                           MatchInfo.Offset);
730   if (!MatchInfo.IsPre &&
731       !findPostIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
732                               MatchInfo.Offset))
733     return false;
734 
735   return true;
736 }
737 
738 void CombinerHelper::applyCombineIndexedLoadStore(
739     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
740   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
741   MachineIRBuilder MIRBuilder(MI);
742   unsigned Opcode = MI.getOpcode();
743   bool IsStore = Opcode == TargetOpcode::G_STORE;
744   unsigned NewOpcode;
745   switch (Opcode) {
746   case TargetOpcode::G_LOAD:
747     NewOpcode = TargetOpcode::G_INDEXED_LOAD;
748     break;
749   case TargetOpcode::G_SEXTLOAD:
750     NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
751     break;
752   case TargetOpcode::G_ZEXTLOAD:
753     NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
754     break;
755   case TargetOpcode::G_STORE:
756     NewOpcode = TargetOpcode::G_INDEXED_STORE;
757     break;
758   default:
759     llvm_unreachable("Unknown load/store opcode");
760   }
761 
762   auto MIB = MIRBuilder.buildInstr(NewOpcode);
763   if (IsStore) {
764     MIB.addDef(MatchInfo.Addr);
765     MIB.addUse(MI.getOperand(0).getReg());
766   } else {
767     MIB.addDef(MI.getOperand(0).getReg());
768     MIB.addDef(MatchInfo.Addr);
769   }
770 
771   MIB.addUse(MatchInfo.Base);
772   MIB.addUse(MatchInfo.Offset);
773   MIB.addImm(MatchInfo.IsPre);
774   MI.eraseFromParent();
775   AddrDef.eraseFromParent();
776 
777   LLVM_DEBUG(dbgs() << "    Combinined to indexed operation");
778 }
779 
780 bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
781   if (MI.getOpcode() != TargetOpcode::G_BR)
782     return false;
783 
784   // Try to match the following:
785   // bb1:
786   //   %c(s32) = G_ICMP pred, %a, %b
787   //   %c1(s1) = G_TRUNC %c(s32)
788   //   G_BRCOND %c1, %bb2
789   //   G_BR %bb3
790   // bb2:
791   // ...
792   // bb3:
793 
  // The above pattern does not have a fallthrough to the successor bb2, so it
  // always ends in a branch no matter which path is taken. Here we try to find
  // and replace that pattern with a conditional branch to bb3 and a
  // fallthrough to bb2.
798 
799   MachineBasicBlock *MBB = MI.getParent();
800   MachineBasicBlock::iterator BrIt(MI);
801   if (BrIt == MBB->begin())
802     return false;
803   assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
804 
805   MachineInstr *BrCond = &*std::prev(BrIt);
806   if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
807     return false;
808 
809   // Check that the next block is the conditional branch target.
810   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
811     return false;
812 
813   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
814   if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
815       !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
816     return false;
817   return true;
818 }
819 
820 bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
821   if (!matchElideBrByInvertingCond(MI))
822     return false;
823   applyElideBrByInvertingCond(MI);
824   return true;
825 }
826 
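// Rewrite the pattern matched above: invert the G_ICMP predicate, retarget the
// G_BRCOND at bb3, and delete the G_BR so that bb2 is reached by fallthrough,
// e.g. (continuing the example in matchElideBrByInvertingCond):
//   %c(s32) = G_ICMP <inverse pred>, %a, %b
//   %c1(s1) = G_TRUNC %c(s32)
//   G_BRCOND %c1, %bb3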
827 void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
828   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
829   MachineBasicBlock::iterator BrIt(MI);
830   MachineInstr *BrCond = &*std::prev(BrIt);
831   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
832 
833   CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
834       (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
835 
836   // Invert the G_ICMP condition.
837   Observer.changingInstr(*CmpMI);
838   CmpMI->getOperand(1).setPredicate(InversePred);
839   Observer.changedInstr(*CmpMI);
840 
841   // Change the conditional branch target.
842   Observer.changingInstr(*BrCond);
843   BrCond->getOperand(1).setMBB(BrTarget);
844   Observer.changedInstr(*BrCond);
845   MI.eraseFromParent();
846 }
847 
848 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
849   // On Darwin, -Os means optimize for size without hurting performance, so
850   // only really optimize for size when -Oz (MinSize) is used.
851   if (MF.getTarget().getTargetTriple().isOSDarwin())
852     return MF.getFunction().hasMinSize();
853   return MF.getFunction().hasOptSize();
854 }
855 
856 // Returns a list of types to use for memory op lowering in MemOps. A partial
857 // port of findOptimalMemOpLowering in TargetLowering.
858 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
859                                           unsigned Limit, const MemOp &Op,
860                                           unsigned DstAS, unsigned SrcAS,
861                                           const AttributeList &FuncAttributes,
862                                           const TargetLowering &TLI) {
863   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
864     return false;
865 
866   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
867 
868   if (Ty == LLT()) {
869     // Use the largest scalar type whose alignment constraints are satisfied.
870     // We only need to check DstAlign here as SrcAlign is always greater or
871     // equal to DstAlign (or zero).
872     Ty = LLT::scalar(64);
873     if (Op.isFixedDstAlign())
874       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
875              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS,
876                                                  Op.getDstAlign().value()))
877         Ty = LLT::scalar(Ty.getSizeInBytes());
878     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
879     // FIXME: check for the largest legal type we can load/store to.
880   }
881 
882   unsigned NumMemOps = 0;
883   uint64_t Size = Op.size();
884   while (Size) {
885     unsigned TySize = Ty.getSizeInBytes();
886     while (TySize > Size) {
      // For now, only use non-vector loads/stores for the left-over pieces.
888       LLT NewTy = Ty;
889       // FIXME: check for mem op safety and legality of the types. Not all of
890       // SDAGisms map cleanly to GISel concepts.
891       if (NewTy.isVector())
892         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
893       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
894       unsigned NewTySize = NewTy.getSizeInBytes();
895       assert(NewTySize > 0 && "Could not find appropriate type");
896 
897       // If the new LLT cannot cover all of the remaining bits, then consider
898       // issuing a (or a pair of) unaligned and overlapping load / store.
899       bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
901       MVT VT = getMVTForLLT(Ty);
902       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
903           TLI.allowsMisalignedMemoryAccesses(
904               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign().value() : 0,
905               MachineMemOperand::MONone, &Fast) &&
906           Fast)
907         TySize = Size;
908       else {
909         Ty = NewTy;
910         TySize = NewTySize;
911       }
912     }
913 
914     if (++NumMemOps > Limit)
915       return false;
916 
917     MemOps.push_back(Ty);
918     Size -= TySize;
919   }
920 
921   return true;
922 }
923 
924 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
925   if (Ty.isVector())
926     return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
927                            Ty.getNumElements());
928   return IntegerType::get(C, Ty.getSizeInBits());
929 }
930 
931 // Get a vectorized representation of the memset value operand, GISel edition.
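// For example (illustrative): splatting a known byte value 0xAB into an s32
// store type yields G_CONSTANT i32 0xABABABAB, while a non-constant byte is
// widened and multiplied by the 0x0101... magic value:
//   %ext:_(s32) = G_ZEXT %val(s8)
//   %magic:_(s32) = G_CONSTANT i32 16843009   (0x01010101)
//   %splat:_(s32) = G_MUL %ext, %magic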
932 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
933   MachineRegisterInfo &MRI = *MIB.getMRI();
934   unsigned NumBits = Ty.getScalarSizeInBits();
935   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
936   if (!Ty.isVector() && ValVRegAndVal) {
937     unsigned KnownVal = ValVRegAndVal->Value;
938     APInt Scalar = APInt(8, KnownVal);
939     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
940     return MIB.buildConstant(Ty, SplatVal).getReg(0);
941   }
942   // FIXME: for vector types create a G_BUILD_VECTOR.
943   if (Ty.isVector())
944     return Register();
945 
946   // Extend the byte value to the larger type, and then multiply by a magic
947   // value 0x010101... in order to replicate it across every byte.
948   LLT ExtType = Ty.getScalarType();
949   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
950   if (NumBits > 8) {
951     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
952     auto MagicMI = MIB.buildConstant(ExtType, Magic);
953     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
954   }
955 
956   assert(ExtType == Ty && "Vector memset value type not supported yet");
957   return Val;
958 }
959 
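// Lower a known-constant-length memset into a short sequence of stores, e.g.
// an 8-byte memset of 0 might become (illustrative; the store types chosen are
// target dependent):
//   %zero:_(s64) = G_CONSTANT i64 0
//   G_STORE %zero(s64), %dst(p0) :: (store 8)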
960 bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
961                                     Register Val, unsigned KnownLen,
962                                     Align Alignment, bool IsVolatile) {
963   auto &MF = *MI.getParent()->getParent();
964   const auto &TLI = *MF.getSubtarget().getTargetLowering();
965   auto &DL = MF.getDataLayout();
966   LLVMContext &C = MF.getFunction().getContext();
967 
968   assert(KnownLen != 0 && "Have a zero length memset length!");
969 
970   bool DstAlignCanChange = false;
971   MachineFrameInfo &MFI = MF.getFrameInfo();
972   bool OptSize = shouldLowerMemFuncForSize(MF);
973 
974   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
975   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
976     DstAlignCanChange = true;
977 
978   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
979   std::vector<LLT> MemOps;
980 
981   const auto &DstMMO = **MI.memoperands_begin();
982   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
983 
984   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
985   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
986 
987   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
988                                      MemOp::Set(KnownLen, DstAlignCanChange,
989                                                 Alignment,
990                                                 /*IsZeroMemset=*/IsZeroVal,
991                                                 /*IsVolatile=*/IsVolatile),
992                                      DstPtrInfo.getAddrSpace(), ~0u,
993                                      MF.getFunction().getAttributes(), TLI))
994     return false;
995 
996   if (DstAlignCanChange) {
997     // Get an estimate of the type from the LLT.
998     Type *IRTy = getTypeForLLT(MemOps[0], C);
999     Align NewAlign = DL.getABITypeAlign(IRTy);
1000     if (NewAlign > Alignment) {
1001       Alignment = NewAlign;
1002       unsigned FI = FIDef->getOperand(1).getIndex();
1003       // Give the stack frame object a larger alignment if needed.
1004       if (MFI.getObjectAlign(FI) < Alignment)
1005         MFI.setObjectAlignment(FI, Alignment);
1006     }
1007   }
1008 
1009   MachineIRBuilder MIB(MI);
1010   // Find the largest store and generate the bit pattern for it.
1011   LLT LargestTy = MemOps[0];
1012   for (unsigned i = 1; i < MemOps.size(); i++)
1013     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
1014       LargestTy = MemOps[i];
1015 
1016   // The memset stored value is always defined as an s8, so in order to make it
1017   // work with larger store types we need to repeat the bit pattern across the
1018   // wider type.
1019   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
1020 
1021   if (!MemSetValue)
1022     return false;
1023 
1024   // Generate the stores. For each store type in the list, we generate the
1025   // matching store of that type to the destination address.
1026   LLT PtrTy = MRI.getType(Dst);
1027   unsigned DstOff = 0;
1028   unsigned Size = KnownLen;
1029   for (unsigned I = 0; I < MemOps.size(); I++) {
1030     LLT Ty = MemOps[I];
1031     unsigned TySize = Ty.getSizeInBytes();
1032     if (TySize > Size) {
1033       // Issuing an unaligned load / store pair that overlaps with the previous
1034       // pair. Adjust the offset accordingly.
1035       assert(I == MemOps.size() - 1 && I != 0);
1036       DstOff -= TySize - Size;
1037     }
1038 
1039     // If this store is smaller than the largest store see whether we can get
1040     // the smaller value for free with a truncate.
1041     Register Value = MemSetValue;
1042     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
1043       MVT VT = getMVTForLLT(Ty);
1044       MVT LargestVT = getMVTForLLT(LargestTy);
1045       if (!LargestTy.isVector() && !Ty.isVector() &&
1046           TLI.isTruncateFree(LargestVT, VT))
1047         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
1048       else
1049         Value = getMemsetValue(Val, Ty, MIB);
1050       if (!Value)
1051         return false;
1052     }
1053 
1054     auto *StoreMMO =
1055         MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
1056 
1057     Register Ptr = Dst;
1058     if (DstOff != 0) {
1059       auto Offset =
1060           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
1061       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1062     }
1063 
1064     MIB.buildStore(Value, Ptr, *StoreMMO);
1065     DstOff += Ty.getSizeInBytes();
1066     Size -= TySize;
1067   }
1068 
1069   MI.eraseFromParent();
1070   return true;
1071 }
1072 
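// Inline a known-constant-length memcpy as loads followed by stores, e.g. a
// 16-byte copy might become (illustrative; the access types are target
// dependent):
//   %v0:_(s64) = G_LOAD %src :: (load 8)
//   G_STORE %v0(s64), %dst :: (store 8)
//   %off:_(s64) = G_CONSTANT i64 8
//   %srcp:_(p0) = G_PTR_ADD %src, %off
//   %v1:_(s64) = G_LOAD %srcp :: (load 8)
//   %dstp:_(p0) = G_PTR_ADD %dst, %off
//   G_STORE %v1(s64), %dstp :: (store 8)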
1073 bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
1074                                     Register Src, unsigned KnownLen,
1075                                     Align DstAlign, Align SrcAlign,
1076                                     bool IsVolatile) {
1077   auto &MF = *MI.getParent()->getParent();
1078   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1079   auto &DL = MF.getDataLayout();
1080   LLVMContext &C = MF.getFunction().getContext();
1081 
1082   assert(KnownLen != 0 && "Have a zero length memcpy length!");
1083 
1084   bool DstAlignCanChange = false;
1085   MachineFrameInfo &MFI = MF.getFrameInfo();
1086   bool OptSize = shouldLowerMemFuncForSize(MF);
1087   Align Alignment = commonAlignment(DstAlign, SrcAlign);
1088 
1089   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1090   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1091     DstAlignCanChange = true;
1092 
1093   // FIXME: infer better src pointer alignment like SelectionDAG does here.
1094   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
1095   // if the memcpy is in a tail call position.
1096 
1097   unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
1098   std::vector<LLT> MemOps;
1099 
1100   const auto &DstMMO = **MI.memoperands_begin();
1101   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1102   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1103   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1104 
1105   if (!findGISelOptimalMemOpLowering(
1106           MemOps, Limit,
1107           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
1108                       IsVolatile),
1109           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
1110           MF.getFunction().getAttributes(), TLI))
1111     return false;
1112 
1113   if (DstAlignCanChange) {
1114     // Get an estimate of the type from the LLT.
1115     Type *IRTy = getTypeForLLT(MemOps[0], C);
1116     Align NewAlign = DL.getABITypeAlign(IRTy);
1117 
1118     // Don't promote to an alignment that would require dynamic stack
1119     // realignment.
1120     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1121     if (!TRI->needsStackRealignment(MF))
1122       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
1123         NewAlign = NewAlign / 2;
1124 
1125     if (NewAlign > Alignment) {
1126       Alignment = NewAlign;
1127       unsigned FI = FIDef->getOperand(1).getIndex();
1128       // Give the stack frame object a larger alignment if needed.
1129       if (MFI.getObjectAlign(FI) < Alignment)
1130         MFI.setObjectAlignment(FI, Alignment);
1131     }
1132   }
1133 
1134   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
1135 
1136   MachineIRBuilder MIB(MI);
1137   // Now we need to emit a pair of load and stores for each of the types we've
1138   // collected. I.e. for each type, generate a load from the source pointer of
1139   // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores of
  // mixed types, depending on what the target specifies as good types to use.
1142   unsigned CurrOffset = 0;
1143   LLT PtrTy = MRI.getType(Src);
1144   unsigned Size = KnownLen;
1145   for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
1147     // pair. Adjust the offset accordingly.
1148     if (CopyTy.getSizeInBytes() > Size)
1149       CurrOffset -= CopyTy.getSizeInBytes() - Size;
1150 
1151     // Construct MMOs for the accesses.
1152     auto *LoadMMO =
1153         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1154     auto *StoreMMO =
1155         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1156 
1157     // Create the load.
1158     Register LoadPtr = Src;
1159     Register Offset;
1160     if (CurrOffset != 0) {
1161       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
1162                    .getReg(0);
1163       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1164     }
1165     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
1166 
1167     // Create the store.
1168     Register StorePtr =
1169         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1170     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
1171     CurrOffset += CopyTy.getSizeInBytes();
1172     Size -= CopyTy.getSizeInBytes();
1173   }
1174 
1175   MI.eraseFromParent();
1176   return true;
1177 }
1178 
1179 bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
1180                                      Register Src, unsigned KnownLen,
1181                                      Align DstAlign, Align SrcAlign,
1182                                      bool IsVolatile) {
1183   auto &MF = *MI.getParent()->getParent();
1184   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1185   auto &DL = MF.getDataLayout();
1186   LLVMContext &C = MF.getFunction().getContext();
1187 
1188   assert(KnownLen != 0 && "Have a zero length memmove length!");
1189 
1190   bool DstAlignCanChange = false;
1191   MachineFrameInfo &MFI = MF.getFrameInfo();
1192   bool OptSize = shouldLowerMemFuncForSize(MF);
1193   Align Alignment = commonAlignment(DstAlign, SrcAlign);
1194 
1195   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1196   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1197     DstAlignCanChange = true;
1198 
1199   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
1200   std::vector<LLT> MemOps;
1201 
1202   const auto &DstMMO = **MI.memoperands_begin();
1203   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1204   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1205   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1206 
1207   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
1209   // same thing here.
1210   if (!findGISelOptimalMemOpLowering(
1211           MemOps, Limit,
1212           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
1213                       /*IsVolatile*/ true),
1214           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
1215           MF.getFunction().getAttributes(), TLI))
1216     return false;
1217 
1218   if (DstAlignCanChange) {
1219     // Get an estimate of the type from the LLT.
1220     Type *IRTy = getTypeForLLT(MemOps[0], C);
1221     Align NewAlign = DL.getABITypeAlign(IRTy);
1222 
1223     // Don't promote to an alignment that would require dynamic stack
1224     // realignment.
1225     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1226     if (!TRI->needsStackRealignment(MF))
1227       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
1228         NewAlign = NewAlign / 2;
1229 
1230     if (NewAlign > Alignment) {
1231       Alignment = NewAlign;
1232       unsigned FI = FIDef->getOperand(1).getIndex();
1233       // Give the stack frame object a larger alignment if needed.
1234       if (MFI.getObjectAlign(FI) < Alignment)
1235         MFI.setObjectAlignment(FI, Alignment);
1236     }
1237   }
1238 
1239   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
1240 
1241   MachineIRBuilder MIB(MI);
1242   // Memmove requires that we perform the loads first before issuing the stores.
1243   // Apart from that, this loop is pretty much doing the same thing as the
1244   // memcpy codegen function.
1245   unsigned CurrOffset = 0;
1246   LLT PtrTy = MRI.getType(Src);
1247   SmallVector<Register, 16> LoadVals;
1248   for (auto CopyTy : MemOps) {
1249     // Construct MMO for the load.
1250     auto *LoadMMO =
1251         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1252 
1253     // Create the load.
1254     Register LoadPtr = Src;
1255     if (CurrOffset != 0) {
1256       auto Offset =
1257           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1258       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1259     }
1260     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
1261     CurrOffset += CopyTy.getSizeInBytes();
1262   }
1263 
1264   CurrOffset = 0;
1265   for (unsigned I = 0; I < MemOps.size(); ++I) {
1266     LLT CopyTy = MemOps[I];
1267     // Now store the values loaded.
1268     auto *StoreMMO =
1269         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1270 
1271     Register StorePtr = Dst;
1272     if (CurrOffset != 0) {
1273       auto Offset =
1274           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1275       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1276     }
1277     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
1278     CurrOffset += CopyTy.getSizeInBytes();
1279   }
1280   MI.eraseFromParent();
1281   return true;
1282 }
1283 
1284 bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
1285   // This combine is fairly complex so it's not written with a separate
1286   // matcher function.
1287   assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1288   Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
1289   assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
1290           ID == Intrinsic::memset) &&
1291          "Expected a memcpy like intrinsic");
1292 
1293   auto MMOIt = MI.memoperands_begin();
1294   const MachineMemOperand *MemOp = *MMOIt;
1295   bool IsVolatile = MemOp->isVolatile();
1296   // Don't try to optimize volatile.
1297   if (IsVolatile)
1298     return false;
1299 
1300   Align DstAlign(MemOp->getBaseAlignment());
1301   Align SrcAlign;
1302   Register Dst = MI.getOperand(1).getReg();
1303   Register Src = MI.getOperand(2).getReg();
1304   Register Len = MI.getOperand(3).getReg();
1305 
1306   if (ID != Intrinsic::memset) {
1307     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
1308     MemOp = *(++MMOIt);
1309     SrcAlign = Align(MemOp->getBaseAlignment());
1310   }
1311 
  // See if this is a constant length copy.
1313   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
1314   if (!LenVRegAndVal)
1315     return false; // Leave it to the legalizer to lower it to a libcall.
1316   unsigned KnownLen = LenVRegAndVal->Value;
1317 
1318   if (KnownLen == 0) {
1319     MI.eraseFromParent();
1320     return true;
1321   }
1322 
1323   if (MaxLen && KnownLen > MaxLen)
1324     return false;
1325 
1326   if (ID == Intrinsic::memcpy)
1327     return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1328   if (ID == Intrinsic::memmove)
1329     return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1330   if (ID == Intrinsic::memset)
1331     return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
1332   return false;
1333 }
1334 
1335 bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
1336                                            PtrAddChain &MatchInfo) {
1337   // We're trying to match the following pattern:
1338   //   %t1 = G_PTR_ADD %base, G_CONSTANT imm1
1339   //   %root = G_PTR_ADD %t1, G_CONSTANT imm2
1340   // -->
1341   //   %root = G_PTR_ADD %base, G_CONSTANT (imm1 + imm2)
1342 
1343   if (MI.getOpcode() != TargetOpcode::G_PTR_ADD)
1344     return false;
1345 
1346   Register Add2 = MI.getOperand(1).getReg();
1347   Register Imm1 = MI.getOperand(2).getReg();
1348   auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
1349   if (!MaybeImmVal)
1350     return false;
1351 
1352   MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
1353   if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
1354     return false;
1355 
1356   Register Base = Add2Def->getOperand(1).getReg();
1357   Register Imm2 = Add2Def->getOperand(2).getReg();
1358   auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
1359   if (!MaybeImm2Val)
1360     return false;
1361 
1362   // Pass the combined immediate to the apply function.
1363   MatchInfo.Imm = MaybeImmVal->Value + MaybeImm2Val->Value;
1364   MatchInfo.Base = Base;
1365   return true;
1366 }
1367 
1368 bool CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
1369                                            PtrAddChain &MatchInfo) {
1370   assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
1371   MachineIRBuilder MIB(MI);
1372   LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
1373   auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
1374   Observer.changingInstr(MI);
1375   MI.getOperand(1).setReg(MatchInfo.Base);
1376   MI.getOperand(2).setReg(NewOffset.getReg(0));
1377   Observer.changedInstr(MI);
1378   return true;
1379 }
1380 
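// Example of the mul-to-shl combine below (illustrative MIR):
//   %c:_(s64) = G_CONSTANT i64 8
//   %r:_(s64) = G_MUL %x, %c
// becomes
//   %s:_(s64) = G_CONSTANT i64 3
//   %r:_(s64) = G_SHL %x, %s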
1381 bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
1382                                           unsigned &ShiftVal) {
1383   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
1384   auto MaybeImmVal =
1385       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
1386   if (!MaybeImmVal || !isPowerOf2_64(MaybeImmVal->Value))
1387     return false;
1388   ShiftVal = Log2_64(MaybeImmVal->Value);
1389   return true;
1390 }
1391 
1392 bool CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
1393                                           unsigned &ShiftVal) {
1394   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
1395   MachineIRBuilder MIB(MI);
1396   LLT ShiftTy = MRI.getType(MI.getOperand(0).getReg());
1397   auto ShiftCst = MIB.buildConstant(ShiftTy, ShiftVal);
1398   Observer.changingInstr(MI);
1399   MI.setDesc(MIB.getTII().get(TargetOpcode::G_SHL));
1400   MI.getOperand(2).setReg(ShiftCst.getReg(0));
1401   Observer.changedInstr(MI);
1402   return true;
1403 }
1404 
1405 bool CombinerHelper::tryCombine(MachineInstr &MI) {
1406   if (tryCombineCopy(MI))
1407     return true;
1408   if (tryCombineExtendingLoads(MI))
1409     return true;
1410   if (tryCombineIndexedLoadStore(MI))
1411     return true;
1412   return false;
1413 }
1414