//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
9 #include "llvm/CodeGen/GlobalISel/Combiner.h"
10 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
11 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
13 #include "llvm/CodeGen/GlobalISel/Utils.h"
14 #include "llvm/CodeGen/MachineDominators.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineInstr.h"
17 #include "llvm/CodeGen/MachineRegisterInfo.h"
18 #include "llvm/CodeGen/TargetInstrInfo.h"
19 #include "llvm/CodeGen/TargetLowering.h"
20 #include "llvm/Target/TargetMachine.h"
21 
22 #define DEBUG_TYPE "gi-combiner"
23 
24 using namespace llvm;
25 
26 // Option to allow testing of the combiner while no targets know about indexed
27 // addressing.
28 static cl::opt<bool>
29     ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
30                        cl::desc("Force all indexed operations to be "
31                                 "legal for the GlobalISel combiner"));
32 
33 
34 CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
35                                MachineIRBuilder &B, GISelKnownBits *KB,
36                                MachineDominatorTree *MDT)
37     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
38       KB(KB), MDT(MDT) {
39   (void)this->KB;
40 }
41 
42 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
43                                     Register ToReg) const {
44   Observer.changingAllUsesOfReg(MRI, FromReg);
45 
46   if (MRI.constrainRegAttrs(ToReg, FromReg))
47     MRI.replaceRegWith(FromReg, ToReg);
48   else
49     Builder.buildCopy(ToReg, FromReg);
50 
51   Observer.finishedChangingAllUsesOfReg();
52 }
53 
54 void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
55                                       MachineOperand &FromRegOp,
56                                       Register ToReg) const {
57   assert(FromRegOp.getParent() && "Expected an operand in an MI");
58   Observer.changingInstr(*FromRegOp.getParent());
59 
60   FromRegOp.setReg(ToReg);
61 
62   Observer.changedInstr(*FromRegOp.getParent());
63 }
64 
65 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
66   if (matchCombineCopy(MI)) {
67     applyCombineCopy(MI);
68     return true;
69   }
70   return false;
71 }
72 bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
73   if (MI.getOpcode() != TargetOpcode::COPY)
74     return false;
75   Register DstReg = MI.getOperand(0).getReg();
76   Register SrcReg = MI.getOperand(1).getReg();
77 
  // Give up if either DstReg or SrcReg is a physical register.
79   if (Register::isPhysicalRegister(DstReg) ||
80       Register::isPhysicalRegister(SrcReg))
81     return false;
82 
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  // Give up if one has a valid LLT, but the other doesn't.
  if (DstTy.isValid() != SrcTy.isValid())
    return false;
  // Give up if the types don't match.
  if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
    return false;
92 
93   // Get the register banks and classes.
94   const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
95   const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
96   const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
97   const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
98 
99   // Replace if the register constraints match.
100   if ((SrcRC == DstRC) && (SrcBank == DstBank))
101     return true;
102   // Replace if DstReg has no constraints.
103   if (!DstBank && !DstRC)
104     return true;
105 
106   return false;
107 }
108 void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
109   Register DstReg = MI.getOperand(0).getReg();
110   Register SrcReg = MI.getOperand(1).getReg();
111   MI.eraseFromParent();
112   replaceRegWith(MRI, DstReg, SrcReg);
113 }
114 
115 bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
116   bool IsUndef = false;
117   SmallVector<Register, 4> Ops;
118   if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
119     applyCombineConcatVectors(MI, IsUndef, Ops);
120     return true;
121   }
122   return false;
123 }
124 
125 bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
126                                                SmallVectorImpl<Register> &Ops) {
127   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
128          "Invalid instruction");
129   IsUndef = true;
130   MachineInstr *Undef = nullptr;
131 
132   // Walk over all the operands of concat vectors and check if they are
133   // build_vector themselves or undef.
134   // Then collect their operands in Ops.
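  // For example (illustrative MIR), for:
  //   %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
  //   %3:_(<2 x s32>) = G_IMPLICIT_DEF
  //   %4:_(<4 x s32>) = G_CONCAT_VECTORS %2(<2 x s32>), %3(<2 x s32>)
  // Ops ends up as {%0, %1, undef, undef}, ready to be rebuilt as a single
  // flattened G_BUILD_VECTOR in applyCombineConcatVectors.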
135   for (const MachineOperand &MO : MI.uses()) {
136     Register Reg = MO.getReg();
137     MachineInstr *Def = MRI.getVRegDef(Reg);
138     assert(Def && "Operand not defined");
139     switch (Def->getOpcode()) {
140     case TargetOpcode::G_BUILD_VECTOR:
141       IsUndef = false;
142       // Remember the operands of the build_vector to fold
143       // them into the yet-to-build flattened concat vectors.
144       for (const MachineOperand &BuildVecMO : Def->uses())
145         Ops.push_back(BuildVecMO.getReg());
146       break;
147     case TargetOpcode::G_IMPLICIT_DEF: {
148       LLT OpType = MRI.getType(Reg);
149       // Keep one undef value for all the undef operands.
150       if (!Undef) {
151         Builder.setInsertPt(*MI.getParent(), MI);
152         Undef = Builder.buildUndef(OpType.getScalarType());
153       }
154       assert(MRI.getType(Undef->getOperand(0).getReg()) ==
155                  OpType.getScalarType() &&
156              "All undefs should have the same type");
157       // Break the undef vector in as many scalar elements as needed
158       // for the flattening.
159       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
160            EltIdx != EltEnd; ++EltIdx)
161         Ops.push_back(Undef->getOperand(0).getReg());
162       break;
163     }
164     default:
165       return false;
166     }
167   }
168   return true;
169 }
170 void CombinerHelper::applyCombineConcatVectors(
171     MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
  // We determined that the concat_vectors can be flattened.
173   // Generate the flattened build_vector.
174   Register DstReg = MI.getOperand(0).getReg();
175   Builder.setInsertPt(*MI.getParent(), MI);
176   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
177 
  // Note: IsUndef is sort of redundant. We could have determined it by
  // checking that all Ops are undef.  Alternatively, we could have
  // generated a build_vector of undefs and relied on another combine to
  // clean that up.  For now, given we already gather this information
  // in tryCombineConcatVectors, just save compile time and issue the
  // right thing.
184   if (IsUndef)
185     Builder.buildUndef(NewDstReg);
186   else
187     Builder.buildBuildVector(NewDstReg, Ops);
188   MI.eraseFromParent();
189   replaceRegWith(MRI, DstReg, NewDstReg);
190 }
191 
192 bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
193   SmallVector<Register, 4> Ops;
194   if (matchCombineShuffleVector(MI, Ops)) {
195     applyCombineShuffleVector(MI, Ops);
196     return true;
197   }
198   return false;
199 }
200 
201 bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
202                                                SmallVectorImpl<Register> &Ops) {
203   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
204          "Invalid instruction kind");
205   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
206   Register Src1 = MI.getOperand(1).getReg();
207   LLT SrcType = MRI.getType(Src1);
208   // As bizarre as it may look, shuffle vector can actually produce
209   // scalar! This is because at the IR level a <1 x ty> shuffle
210   // vector is perfectly valid.
211   unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
212   unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
213 
214   // If the resulting vector is smaller than the size of the source
215   // vectors being concatenated, we won't be able to replace the
216   // shuffle vector into a concat_vectors.
217   //
218   // Note: We may still be able to produce a concat_vectors fed by
219   //       extract_vector_elt and so on. It is less clear that would
220   //       be better though, so don't bother for now.
221   //
222   // If the destination is a scalar, the size of the sources doesn't
  // matter: we will lower the shuffle to a plain copy. This will
  // only work if the source and destination have the same size, but
  // that's covered by the next condition.
  //
  // TODO: If the sizes of the source and destination don't match,
  //       we could still emit an extract vector element in that case.
229   if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
230     return false;
231 
232   // Check that the shuffle mask can be broken evenly between the
233   // different sources.
234   if (DstNumElts % SrcNumElts != 0)
235     return false;
236 
237   // Mask length is a multiple of the source vector length.
238   // Check if the shuffle is some kind of concatenation of the input
239   // vectors.
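  // For example (illustrative): with two <2 x s32> sources, the mask
  // <0, 1, 2, 3> concatenates Src1 and Src2, and <2, 3, 2, 3> concatenates
  // Src2 with itself, whereas <0, 2, 1, 3> is not a concatenation.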
240   unsigned NumConcat = DstNumElts / SrcNumElts;
241   SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
242   SmallVector<int, 8> Mask;
243   ShuffleVectorInst::getShuffleMask(MI.getOperand(3).getShuffleMask(), Mask);
244   for (unsigned i = 0; i != DstNumElts; ++i) {
245     int Idx = Mask[i];
246     // Undef value.
247     if (Idx < 0)
248       continue;
249     // Ensure the indices in each SrcType sized piece are sequential and that
250     // the same source is used for the whole piece.
251     if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
252         (ConcatSrcs[i / SrcNumElts] >= 0 &&
253          ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
254       return false;
255     // Remember which source this index came from.
256     ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
257   }
258 
259   // The shuffle is concatenating multiple vectors together.
260   // Collect the different operands for that.
261   Register UndefReg;
262   Register Src2 = MI.getOperand(2).getReg();
263   for (auto Src : ConcatSrcs) {
264     if (Src < 0) {
265       if (!UndefReg) {
266         Builder.setInsertPt(*MI.getParent(), MI);
267         UndefReg = Builder.buildUndef(SrcType).getReg(0);
268       }
269       Ops.push_back(UndefReg);
270     } else if (Src == 0)
271       Ops.push_back(Src1);
272     else
273       Ops.push_back(Src2);
274   }
275   return true;
276 }
277 
278 void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
279                                                const ArrayRef<Register> Ops) {
280   Register DstReg = MI.getOperand(0).getReg();
281   Builder.setInsertPt(*MI.getParent(), MI);
282   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
283 
284   if (Ops.size() == 1)
285     Builder.buildCopy(NewDstReg, Ops[0]);
286   else
287     Builder.buildMerge(NewDstReg, Ops);
288 
289   MI.eraseFromParent();
290   replaceRegWith(MRI, DstReg, NewDstReg);
291 }
292 
293 namespace {
294 
295 /// Select a preference between two uses. CurrentUse is the current preference
/// while *ForCandidate are the attributes of the candidate under
/// consideration.
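/// Roughly: a G_SEXT or G_ZEXT candidate is preferred over a G_ANYEXT one,
/// G_SEXT is preferred over G_ZEXT of the same type, and otherwise the
/// candidate with the larger result type wins.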
297 PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
298                                   const LLT &TyForCandidate,
299                                   unsigned OpcodeForCandidate,
300                                   MachineInstr *MIForCandidate) {
301   if (!CurrentUse.Ty.isValid()) {
302     if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
303         CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
304       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
305     return CurrentUse;
306   }
307 
308   // We permit the extend to hoist through basic blocks but this is only
309   // sensible if the target has extending loads. If you end up lowering back
310   // into a load and extend during the legalizer then the end result is
311   // hoisting the extend up to the load.
312 
313   // Prefer defined extensions to undefined extensions as these are more
314   // likely to reduce the number of instructions.
315   if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
316       CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
317     return CurrentUse;
318   else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
319            OpcodeForCandidate != TargetOpcode::G_ANYEXT)
320     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
321 
  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so it pays to fold the more expensive extension into the
  // load and leave the cheaper one to be materialized separately.
324   if (CurrentUse.Ty == TyForCandidate) {
325     if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
326         OpcodeForCandidate == TargetOpcode::G_ZEXT)
327       return CurrentUse;
328     else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
329              OpcodeForCandidate == TargetOpcode::G_SEXT)
330       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
331   }
332 
333   // This is potentially target specific. We've chosen the largest type
334   // because G_TRUNC is usually free. One potential catch with this is that
335   // some targets have a reduced number of larger registers than smaller
336   // registers and this choice potentially increases the live-range for the
337   // larger value.
338   if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
339     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
340   }
341   return CurrentUse;
342 }
343 
344 /// Find a suitable place to insert some instructions and insert them. This
345 /// function accounts for special cases like inserting before a PHI node.
346 /// The current strategy for inserting before PHI's is to duplicate the
347 /// instructions for each predecessor. However, while that's ok for G_TRUNC
348 /// on most targets since it generally requires no code, other targets/cases may
349 /// want to try harder to find a dominating block.
350 static void InsertInsnsWithoutSideEffectsBeforeUse(
351     MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
352     std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
353                        MachineOperand &UseMO)>
354         Inserter) {
355   MachineInstr &UseMI = *UseMO.getParent();
356 
357   MachineBasicBlock *InsertBB = UseMI.getParent();
358 
359   // If the use is a PHI then we want the predecessor block instead.
360   if (UseMI.isPHI()) {
361     MachineOperand *PredBB = std::next(&UseMO);
362     InsertBB = PredBB->getMBB();
363   }
364 
365   // If the block is the same block as the def then we want to insert just after
366   // the def instead of at the start of the block.
367   if (InsertBB == DefMI.getParent()) {
368     MachineBasicBlock::iterator InsertPt = &DefMI;
369     Inserter(InsertBB, std::next(InsertPt), UseMO);
370     return;
371   }
372 
373   // Otherwise we want the start of the BB
374   Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
375 }
376 } // end anonymous namespace
377 
378 bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
379   PreferredTuple Preferred;
380   if (matchCombineExtendingLoads(MI, Preferred)) {
381     applyCombineExtendingLoads(MI, Preferred);
382     return true;
383   }
384   return false;
385 }
386 
387 bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
388                                                 PreferredTuple &Preferred) {
389   // We match the loads and follow the uses to the extend instead of matching
390   // the extends and following the def to the load. This is because the load
391   // must remain in the same position for correctness (unless we also add code
392   // to find a safe place to sink it) whereas the extend is freely movable.
393   // It also prevents us from duplicating the load for the volatile case or just
394   // for performance.
395 
396   if (MI.getOpcode() != TargetOpcode::G_LOAD &&
397       MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
398       MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
399     return false;
400 
401   auto &LoadValue = MI.getOperand(0);
402   assert(LoadValue.isReg() && "Result wasn't a register?");
403 
404   LLT LoadValueTy = MRI.getType(LoadValue.getReg());
405   if (!LoadValueTy.isScalar())
406     return false;
407 
408   // Most architectures are going to legalize <s8 loads into at least a 1 byte
409   // load, and the MMOs can only describe memory accesses in multiples of bytes.
410   // If we try to perform extload combining on those, we can end up with
411   // %a(s8) = extload %ptr (load 1 byte from %ptr)
412   // ... which is an illegal extload instruction.
413   if (LoadValueTy.getSizeInBits() < 8)
414     return false;
415 
416   // For non power-of-2 types, they will very likely be legalized into multiple
417   // loads. Don't bother trying to match them into extending loads.
418   if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
419     return false;
420 
421   // Find the preferred type aside from the any-extends (unless it's the only
422   // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
424   // relative type sizes. At the same time, pick an extend to use based on the
425   // extend involved in the chosen type.
426   unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
427                                  ? TargetOpcode::G_ANYEXT
428                                  : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
429                                        ? TargetOpcode::G_SEXT
430                                        : TargetOpcode::G_ZEXT;
431   Preferred = {LLT(), PreferredOpcode, nullptr};
432   for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
433     if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
434         UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
435         UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
436       Preferred = ChoosePreferredUse(Preferred,
437                                      MRI.getType(UseMI.getOperand(0).getReg()),
438                                      UseMI.getOpcode(), &UseMI);
439     }
440   }
441 
442   // There were no extends
443   if (!Preferred.MI)
444     return false;
  // It should be impossible to choose an extend without selecting a different
446   // type since by definition the result of an extend is larger.
447   assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
448 
449   LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
450   return true;
451 }
452 
453 void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
454                                                 PreferredTuple &Preferred) {
455   // Rewrite the load to the chosen extending load.
456   Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
457 
458   // Inserter to insert a truncate back to the original type at a given point
459   // with some basic CSE to limit truncate duplication to one per BB.
460   DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
461   auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
462                            MachineBasicBlock::iterator InsertBefore,
463                            MachineOperand &UseMO) {
464     MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
465     if (PreviouslyEmitted) {
466       Observer.changingInstr(*UseMO.getParent());
467       UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
468       Observer.changedInstr(*UseMO.getParent());
469       return;
470     }
471 
472     Builder.setInsertPt(*InsertIntoBB, InsertBefore);
473     Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
474     MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
475     EmittedInsns[InsertIntoBB] = NewMI;
476     replaceRegOpWith(MRI, UseMO, NewDstReg);
477   };
478 
479   Observer.changingInstr(MI);
480   MI.setDesc(
481       Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
482                                ? TargetOpcode::G_SEXTLOAD
483                                : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
484                                      ? TargetOpcode::G_ZEXTLOAD
485                                      : TargetOpcode::G_LOAD));
486 
487   // Rewrite all the uses to fix up the types.
488   auto &LoadValue = MI.getOperand(0);
489   SmallVector<MachineOperand *, 4> Uses;
490   for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
491     Uses.push_back(&UseMO);
492 
493   for (auto *UseMO : Uses) {
494     MachineInstr *UseMI = UseMO->getParent();
495 
496     // If the extend is compatible with the preferred extend then we should fix
497     // up the type and extend so that it uses the preferred use.
498     if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
499         UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
500       Register UseDstReg = UseMI->getOperand(0).getReg();
501       MachineOperand &UseSrcMO = UseMI->getOperand(1);
502       const LLT &UseDstTy = MRI.getType(UseDstReg);
503       if (UseDstReg != ChosenDstReg) {
504         if (Preferred.Ty == UseDstTy) {
505           // If the use has the same type as the preferred use, then merge
506           // the vregs and erase the extend. For example:
507           //    %1:_(s8) = G_LOAD ...
508           //    %2:_(s32) = G_SEXT %1(s8)
509           //    %3:_(s32) = G_ANYEXT %1(s8)
510           //    ... = ... %3(s32)
511           // rewrites to:
512           //    %2:_(s32) = G_SEXTLOAD ...
513           //    ... = ... %2(s32)
514           replaceRegWith(MRI, UseDstReg, ChosenDstReg);
515           Observer.erasingInstr(*UseMO->getParent());
516           UseMO->getParent()->eraseFromParent();
517         } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
518           // If the preferred size is smaller, then keep the extend but extend
519           // from the result of the extending load. For example:
520           //    %1:_(s8) = G_LOAD ...
521           //    %2:_(s32) = G_SEXT %1(s8)
522           //    %3:_(s64) = G_ANYEXT %1(s8)
523           //    ... = ... %3(s64)
          // rewrites to:
525           //    %2:_(s32) = G_SEXTLOAD ...
526           //    %3:_(s64) = G_ANYEXT %2:_(s32)
527           //    ... = ... %3(s64)
528           replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
529         } else {
          // If the preferred size is larger, then insert a truncate. For
          // example:
          //    %1:_(s8) = G_LOAD ...
          //    %2:_(s64) = G_SEXT %1(s8)
          //    %3:_(s32) = G_ZEXT %1(s8)
          //    ... = ... %3(s32)
          // rewrites to:
          //    %2:_(s64) = G_SEXTLOAD ...
          //    %4:_(s8) = G_TRUNC %2:_(s64)
          //    %3:_(s32) = G_ZEXT %4:_(s8)
          //    ... = ... %3(s32)
541           InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
542                                                  InsertTruncAt);
543         }
544         continue;
545       }
546       // The use is (one of) the uses of the preferred use we chose earlier.
547       // We're going to update the load to def this value later so just erase
548       // the old extend.
549       Observer.erasingInstr(*UseMO->getParent());
550       UseMO->getParent()->eraseFromParent();
551       continue;
552     }
553 
554     // The use isn't an extend. Truncate back to the type we originally loaded.
555     // This is free on many targets.
556     InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
557   }
558 
559   MI.getOperand(0).setReg(ChosenDstReg);
560   Observer.changedInstr(MI);
561 }
562 
563 bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
564   assert(DefMI.getParent() == UseMI.getParent());
565   if (&DefMI == &UseMI)
566     return false;
567 
  // Loop through the basic block until we find one of the instructions.
  // Whichever one we reach first must be the predecessor.
  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
  for (; &*I != &DefMI && &*I != &UseMI; ++I)
    ;
  return &*I == &DefMI;
574 }
575 
576 bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
577   if (MDT)
578     return MDT->dominates(&DefMI, &UseMI);
579   else if (DefMI.getParent() != UseMI.getParent())
580     return false;
581 
582   return isPredecessor(DefMI, UseMI);
583 }
584 
585 bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
586                                             Register &Base, Register &Offset) {
587   auto &MF = *MI.getParent()->getParent();
588   const auto &TLI = *MF.getSubtarget().getTargetLowering();
589 
590 #ifndef NDEBUG
591   unsigned Opcode = MI.getOpcode();
592   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
593          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
594 #endif
595 
596   Base = MI.getOperand(1).getReg();
597   MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
598   if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
599     return false;
600 
601   LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
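  // We are looking for a use of Base of the form (illustrative MIR):
  //   ... = G_LOAD %base           ; MI
  //   %addr = G_PTR_ADD %base, %offset
  // which can then be folded into a single post-incrementing indexed
  // operation.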
602 
603   for (auto &Use : MRI.use_instructions(Base)) {
604     if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
605       continue;
606 
607     Offset = Use.getOperand(2).getReg();
608     if (!ForceLegalIndexing &&
609         !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
610       LLVM_DEBUG(dbgs() << "    Ignoring candidate with illegal addrmode: "
611                         << Use);
612       continue;
613     }
614 
615     // Make sure the offset calculation is before the potentially indexed op.
616     // FIXME: we really care about dependency here. The offset calculation might
617     // be movable.
618     MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
619     if (!OffsetDef || !dominates(*OffsetDef, MI)) {
620       LLVM_DEBUG(dbgs() << "    Ignoring candidate with offset after mem-op: "
621                         << Use);
622       continue;
623     }
624 
625     // FIXME: check whether all uses of Base are load/store with foldable
626     // addressing modes. If so, using the normal addr-modes is better than
627     // forming an indexed one.
628 
629     bool MemOpDominatesAddrUses = true;
630     for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
631       if (!dominates(MI, PtrAddUse)) {
632         MemOpDominatesAddrUses = false;
633         break;
634       }
635     }
636 
637     if (!MemOpDominatesAddrUses) {
638       LLVM_DEBUG(
639           dbgs() << "    Ignoring candidate as memop does not dominate uses: "
640                  << Use);
641       continue;
642     }
643 
644     LLVM_DEBUG(dbgs() << "    Found match: " << Use);
645     Addr = Use.getOperand(0).getReg();
646     return true;
647   }
648 
649   return false;
650 }
651 
652 bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
653                                            Register &Base, Register &Offset) {
654   auto &MF = *MI.getParent()->getParent();
655   const auto &TLI = *MF.getSubtarget().getTargetLowering();
656 
657 #ifndef NDEBUG
658   unsigned Opcode = MI.getOpcode();
659   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
660          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
661 #endif
662 
663   Addr = MI.getOperand(1).getReg();
664   MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
665   if (!AddrDef || MRI.hasOneUse(Addr))
666     return false;
667 
668   Base = AddrDef->getOperand(1).getReg();
669   Offset = AddrDef->getOperand(2).getReg();
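  // That is, we matched something of the form (illustrative MIR):
  //   %addr = G_PTR_ADD %base, %offset
  //   ... = G_LOAD %addr           ; MI
  // which can be folded into a pre-incrementing indexed operation provided
  // the remaining uses of %addr are dominated by the memory operation.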
670 
671   LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
672 
673   if (!ForceLegalIndexing &&
674       !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
675     LLVM_DEBUG(dbgs() << "    Skipping, not legal for target");
676     return false;
677   }
678 
679   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
680   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
681     LLVM_DEBUG(dbgs() << "    Skipping, frame index would need copy anyway.");
682     return false;
683   }
684 
685   if (MI.getOpcode() == TargetOpcode::G_STORE) {
686     // Would require a copy.
687     if (Base == MI.getOperand(0).getReg()) {
688       LLVM_DEBUG(dbgs() << "    Skipping, storing base so need copy anyway.");
689       return false;
690     }
691 
692     // We're expecting one use of Addr in MI, but it could also be the
693     // value stored, which isn't actually dominated by the instruction.
694     if (MI.getOperand(0).getReg() == Addr) {
695       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses");
696       return false;
697     }
698   }
699 
700   // FIXME: check whether all uses of the base pointer are constant PtrAdds.
701   // That might allow us to end base's liveness here by adjusting the constant.
702 
703   for (auto &UseMI : MRI.use_instructions(Addr)) {
704     if (!dominates(MI, UseMI)) {
705       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses.");
706       return false;
707     }
708   }
709 
710   return true;
711 }
712 
713 bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
714   unsigned Opcode = MI.getOpcode();
715   if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
716       Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
717     return false;
718 
719   bool IsStore = Opcode == TargetOpcode::G_STORE;
720   Register Addr, Base, Offset;
721   bool IsPre = findPreIndexCandidate(MI, Addr, Base, Offset);
722   if (!IsPre && !findPostIndexCandidate(MI, Addr, Base, Offset))
723     return false;
724 
725 
726   unsigned NewOpcode;
727   switch (Opcode) {
728   case TargetOpcode::G_LOAD:
729     NewOpcode = TargetOpcode::G_INDEXED_LOAD;
730     break;
731   case TargetOpcode::G_SEXTLOAD:
732     NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
733     break;
734   case TargetOpcode::G_ZEXTLOAD:
735     NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
736     break;
737   case TargetOpcode::G_STORE:
738     NewOpcode = TargetOpcode::G_INDEXED_STORE;
739     break;
740   default:
741     llvm_unreachable("Unknown load/store opcode");
742   }
743 
744   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(Addr);
745   MachineIRBuilder MIRBuilder(MI);
746   auto MIB = MIRBuilder.buildInstr(NewOpcode);
747   if (IsStore) {
748     MIB.addDef(Addr);
749     MIB.addUse(MI.getOperand(0).getReg());
750   } else {
751     MIB.addDef(MI.getOperand(0).getReg());
752     MIB.addDef(Addr);
753   }
754 
755   MIB.addUse(Base);
756   MIB.addUse(Offset);
757   MIB.addImm(IsPre);
758   MI.eraseFromParent();
759   AddrDef.eraseFromParent();
760 
  LLVM_DEBUG(dbgs() << "    Combined to indexed operation");
762   return true;
763 }
764 
765 bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
766   if (MI.getOpcode() != TargetOpcode::G_BR)
767     return false;
768 
769   // Try to match the following:
770   // bb1:
771   //   %c(s32) = G_ICMP pred, %a, %b
772   //   %c1(s1) = G_TRUNC %c(s32)
773   //   G_BRCOND %c1, %bb2
774   //   G_BR %bb3
775   // bb2:
776   // ...
777   // bb3:
778 
  // The above pattern does not have a fall through to the successor bb2,
  // always resulting in a branch no matter which path is taken. Here we try
  // to find and replace that pattern with a conditional branch to bb3
  // (inverting the condition) and a fallthrough to bb2.
783 
784   MachineBasicBlock *MBB = MI.getParent();
785   MachineBasicBlock::iterator BrIt(MI);
786   if (BrIt == MBB->begin())
787     return false;
788   assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
789 
790   MachineInstr *BrCond = &*std::prev(BrIt);
791   if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
792     return false;
793 
794   // Check that the next block is the conditional branch target.
795   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
796     return false;
797 
798   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
799   if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
800       !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
801     return false;
802   return true;
803 }
804 
805 bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
806   if (!matchElideBrByInvertingCond(MI))
807     return false;
808   applyElideBrByInvertingCond(MI);
809   return true;
810 }
811 
812 void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
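  // After this rewrite, the example from matchElideBrByInvertingCond becomes
  // (roughly):
  //   %c(s32) = G_ICMP <inverted pred>, %a, %b
  //   %c1(s1) = G_TRUNC %c(s32)
  //   G_BRCOND %c1, %bb3
  // with execution falling through to bb2.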
813   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
814   MachineBasicBlock::iterator BrIt(MI);
815   MachineInstr *BrCond = &*std::prev(BrIt);
816   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
817 
818   CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
819       (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
820 
821   // Invert the G_ICMP condition.
822   Observer.changingInstr(*CmpMI);
823   CmpMI->getOperand(1).setPredicate(InversePred);
824   Observer.changedInstr(*CmpMI);
825 
826   // Change the conditional branch target.
827   Observer.changingInstr(*BrCond);
828   BrCond->getOperand(1).setMBB(BrTarget);
829   Observer.changedInstr(*BrCond);
830   MI.eraseFromParent();
831 }
832 
833 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
834   // On Darwin, -Os means optimize for size without hurting performance, so
835   // only really optimize for size when -Oz (MinSize) is used.
836   if (MF.getTarget().getTargetTriple().isOSDarwin())
837     return MF.getFunction().hasMinSize();
838   return MF.getFunction().hasOptSize();
839 }
840 
841 // Returns a list of types to use for memory op lowering in MemOps. A partial
842 // port of findOptimalMemOpLowering in TargetLowering.
843 static bool findGISelOptimalMemOpLowering(
844     std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
845     unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
846     bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
847     const AttributeList &FuncAttributes, const TargetLowering &TLI) {
848   // If 'SrcAlign' is zero, that means the memory operation does not need to
849   // load the value, i.e. memset or memcpy from constant string. Otherwise,
850   // it's the inferred alignment of the source. 'DstAlign', on the other hand,
851   // is the specified alignment of the memory operation. If it is zero, that
852   // means it's possible to change the alignment of the destination.
853   // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
854   // not need to be loaded.
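  // As an illustration, an 11-byte memcpy on a target whose preferred type is
  // s64 might produce MemOps = {s64, s16, s8}, or {s64, s32} if overlapping
  // unaligned accesses are allowed and fast.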
855   if (SrcAlign != 0 && SrcAlign < DstAlign)
856     return false;
857 
858   LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
859                                   ZeroMemset, MemcpyStrSrc, FuncAttributes);
860 
861   if (Ty == LLT()) {
862     // Use the largest scalar type whose alignment constraints are satisfied.
863     // We only need to check DstAlign here as SrcAlign is always greater or
864     // equal to DstAlign (or zero).
865     Ty = LLT::scalar(64);
866     while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
867            !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
868       Ty = LLT::scalar(Ty.getSizeInBytes());
869     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
870     // FIXME: check for the largest legal type we can load/store to.
871   }
872 
873   unsigned NumMemOps = 0;
874   while (Size != 0) {
875     unsigned TySize = Ty.getSizeInBytes();
876     while (TySize > Size) {
877       // For now, only use non-vector load / store's for the left-over pieces.
878       LLT NewTy = Ty;
879       // FIXME: check for mem op safety and legality of the types. Not all of
880       // SDAGisms map cleanly to GISel concepts.
881       if (NewTy.isVector())
882         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
883       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
884       unsigned NewTySize = NewTy.getSizeInBytes();
885       assert(NewTySize > 0 && "Could not find appropriate type");
886 
887       // If the new LLT cannot cover all of the remaining bits, then consider
888       // issuing a (or a pair of) unaligned and overlapping load / store.
889       bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
891       MVT VT = getMVTForLLT(Ty);
892       if (NumMemOps && AllowOverlap && NewTySize < Size &&
893           TLI.allowsMisalignedMemoryAccesses(
894               VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
895           Fast)
896         TySize = Size;
897       else {
898         Ty = NewTy;
899         TySize = NewTySize;
900       }
901     }
902 
903     if (++NumMemOps > Limit)
904       return false;
905 
906     MemOps.push_back(Ty);
907     Size -= TySize;
908   }
909 
910   return true;
911 }
912 
913 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
914   if (Ty.isVector())
915     return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
916                            Ty.getNumElements());
917   return IntegerType::get(C, Ty.getSizeInBits());
918 }
919 
920 // Get a vectorized representation of the memset value operand, GISel edition.
921 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
922   MachineRegisterInfo &MRI = *MIB.getMRI();
923   unsigned NumBits = Ty.getScalarSizeInBits();
924   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
925   if (!Ty.isVector() && ValVRegAndVal) {
926     unsigned KnownVal = ValVRegAndVal->Value;
927     APInt Scalar = APInt(8, KnownVal);
928     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
929     return MIB.buildConstant(Ty, SplatVal).getReg(0);
930   }
931   // FIXME: for vector types create a G_BUILD_VECTOR.
932   if (Ty.isVector())
933     return Register();
934 
935   // Extend the byte value to the larger type, and then multiply by a magic
936   // value 0x010101... in order to replicate it across every byte.
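  // E.g. for a 32-bit type and byte value 0xAB:
  //   0x000000AB * 0x01010101 = 0xABABABAB.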
937   LLT ExtType = Ty.getScalarType();
938   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
939   if (NumBits > 8) {
940     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
941     auto MagicMI = MIB.buildConstant(ExtType, Magic);
942     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
943   }
944 
945   assert(ExtType == Ty && "Vector memset value type not supported yet");
946   return Val;
947 }
948 
bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
                                    Register Val, unsigned KnownLen,
                                    unsigned Align, bool IsVolatile) {
952   auto &MF = *MI.getParent()->getParent();
953   const auto &TLI = *MF.getSubtarget().getTargetLowering();
954   auto &DL = MF.getDataLayout();
955   LLVMContext &C = MF.getFunction().getContext();
956 
957   assert(KnownLen != 0 && "Have a zero length memset length!");
958 
959   bool DstAlignCanChange = false;
960   MachineFrameInfo &MFI = MF.getFrameInfo();
961   bool OptSize = shouldLowerMemFuncForSize(MF);
962 
963   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
964   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
965     DstAlignCanChange = true;
966 
967   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
968   std::vector<LLT> MemOps;
969 
970   const auto &DstMMO = **MI.memoperands_begin();
971   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
972 
973   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
974   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
975 
976   if (!findGISelOptimalMemOpLowering(
977           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
978           /*IsMemset=*/true,
979           /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
980           /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
981           MF.getFunction().getAttributes(), TLI))
982     return false;
983 
984   if (DstAlignCanChange) {
985     // Get an estimate of the type from the LLT.
986     Type *IRTy = getTypeForLLT(MemOps[0], C);
987     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
988     if (NewAlign > Align) {
989       Align = NewAlign;
990       unsigned FI = FIDef->getOperand(1).getIndex();
991       // Give the stack frame object a larger alignment if needed.
992       if (MFI.getObjectAlignment(FI) < Align)
993         MFI.setObjectAlignment(FI, Align);
994     }
995   }
996 
997   MachineIRBuilder MIB(MI);
998   // Find the largest store and generate the bit pattern for it.
999   LLT LargestTy = MemOps[0];
1000   for (unsigned i = 1; i < MemOps.size(); i++)
1001     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
1002       LargestTy = MemOps[i];
1003 
1004   // The memset stored value is always defined as an s8, so in order to make it
1005   // work with larger store types we need to repeat the bit pattern across the
1006   // wider type.
1007   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
1008 
1009   if (!MemSetValue)
1010     return false;
1011 
1012   // Generate the stores. For each store type in the list, we generate the
1013   // matching store of that type to the destination address.
1014   LLT PtrTy = MRI.getType(Dst);
1015   unsigned DstOff = 0;
1016   unsigned Size = KnownLen;
1017   for (unsigned I = 0; I < MemOps.size(); I++) {
1018     LLT Ty = MemOps[I];
1019     unsigned TySize = Ty.getSizeInBytes();
1020     if (TySize > Size) {
1021       // Issuing an unaligned load / store pair that overlaps with the previous
1022       // pair. Adjust the offset accordingly.
1023       assert(I == MemOps.size() - 1 && I != 0);
1024       DstOff -= TySize - Size;
1025     }
1026 
1027     // If this store is smaller than the largest store see whether we can get
1028     // the smaller value for free with a truncate.
1029     Register Value = MemSetValue;
1030     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
1031       MVT VT = getMVTForLLT(Ty);
1032       MVT LargestVT = getMVTForLLT(LargestTy);
1033       if (!LargestTy.isVector() && !Ty.isVector() &&
1034           TLI.isTruncateFree(LargestVT, VT))
1035         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
1036       else
1037         Value = getMemsetValue(Val, Ty, MIB);
1038       if (!Value)
1039         return false;
1040     }
1041 
1042     auto *StoreMMO =
1043         MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
1044 
1045     Register Ptr = Dst;
1046     if (DstOff != 0) {
1047       auto Offset =
1048           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
1049       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1050     }
1051 
1052     MIB.buildStore(Value, Ptr, *StoreMMO);
1053     DstOff += Ty.getSizeInBytes();
1054     Size -= TySize;
1055   }
1056 
1057   MI.eraseFromParent();
1058   return true;
1059 }
1060 
1061 
1062 bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
1063                                     Register Src, unsigned KnownLen,
1064                                     unsigned DstAlign, unsigned SrcAlign,
1065                                     bool IsVolatile) {
1066   auto &MF = *MI.getParent()->getParent();
1067   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1068   auto &DL = MF.getDataLayout();
1069   LLVMContext &C = MF.getFunction().getContext();
1070 
1071   assert(KnownLen != 0 && "Have a zero length memcpy length!");
1072 
1073   bool DstAlignCanChange = false;
1074   MachineFrameInfo &MFI = MF.getFrameInfo();
1075   bool OptSize = shouldLowerMemFuncForSize(MF);
1076   unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1077 
1078   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1079   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1080     DstAlignCanChange = true;
1081 
1082   // FIXME: infer better src pointer alignment like SelectionDAG does here.
1083   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
1084   // if the memcpy is in a tail call position.
1085 
1086   unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
1087   std::vector<LLT> MemOps;
1088 
1089   const auto &DstMMO = **MI.memoperands_begin();
1090   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1091   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1092   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1093 
1094   if (!findGISelOptimalMemOpLowering(
1095           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1096           SrcAlign,
1097           /*IsMemset=*/false,
1098           /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1099           /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
1100           SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1101     return false;
1102 
1103   if (DstAlignCanChange) {
1104     // Get an estimate of the type from the LLT.
1105     Type *IRTy = getTypeForLLT(MemOps[0], C);
1106     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1107 
1108     // Don't promote to an alignment that would require dynamic stack
1109     // realignment.
1110     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1111     if (!TRI->needsStackRealignment(MF))
1112       while (NewAlign > Alignment &&
1113              DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1114         NewAlign /= 2;
1115 
1116     if (NewAlign > Alignment) {
1117       Alignment = NewAlign;
1118       unsigned FI = FIDef->getOperand(1).getIndex();
1119       // Give the stack frame object a larger alignment if needed.
1120       if (MFI.getObjectAlignment(FI) < Alignment)
1121         MFI.setObjectAlignment(FI, Alignment);
1122     }
1123   }
1124 
1125   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
1126 
1127   MachineIRBuilder MIB(MI);
1128   // Now we need to emit a pair of load and stores for each of the types we've
1129   // collected. I.e. for each type, generate a load from the source pointer of
1130   // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores
  // of mixed types, depending on what the target specifies as good types to
  // use.
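  // For example (illustrative), a 12-byte copy on a target that prefers s64
  // could become an s64 load/store at offset 0 followed by an s32 load/store
  // at offset 8.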
1133   unsigned CurrOffset = 0;
1134   LLT PtrTy = MRI.getType(Src);
1135   unsigned Size = KnownLen;
1136   for (auto CopyTy : MemOps) {
    // If the copy type is wider than the bytes remaining, this load / store
    // pair overlaps with the previous pair. Adjust the offset accordingly.
1139     if (CopyTy.getSizeInBytes() > Size)
1140       CurrOffset -= CopyTy.getSizeInBytes() - Size;
1141 
1142     // Construct MMOs for the accesses.
1143     auto *LoadMMO =
1144         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1145     auto *StoreMMO =
1146         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1147 
1148     // Create the load.
1149     Register LoadPtr = Src;
1150     Register Offset;
1151     if (CurrOffset != 0) {
1152       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
1153                    .getReg(0);
1154       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1155     }
1156     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
1157 
1158     // Create the store.
1159     Register StorePtr =
1160         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1161     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
1162     CurrOffset += CopyTy.getSizeInBytes();
1163     Size -= CopyTy.getSizeInBytes();
1164   }
1165 
1166   MI.eraseFromParent();
1167   return true;
1168 }
1169 
1170 bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
                                     Register Src, unsigned KnownLen,
                                     unsigned DstAlign, unsigned SrcAlign,
                                     bool IsVolatile) {
1174   auto &MF = *MI.getParent()->getParent();
1175   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1176   auto &DL = MF.getDataLayout();
1177   LLVMContext &C = MF.getFunction().getContext();
1178 
1179   assert(KnownLen != 0 && "Have a zero length memmove length!");
1180 
1181   bool DstAlignCanChange = false;
1182   MachineFrameInfo &MFI = MF.getFrameInfo();
1183   bool OptSize = shouldLowerMemFuncForSize(MF);
1184   unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1185 
1186   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1187   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1188     DstAlignCanChange = true;
1189 
1190   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
1191   std::vector<LLT> MemOps;
1192 
1193   const auto &DstMMO = **MI.memoperands_begin();
1194   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1195   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1196   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1197 
1198   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
1200   // same thing here.
1201   if (!findGISelOptimalMemOpLowering(
1202           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1203           SrcAlign,
1204           /*IsMemset=*/false,
1205           /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1206           /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
1207           SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1208     return false;
1209 
1210   if (DstAlignCanChange) {
1211     // Get an estimate of the type from the LLT.
1212     Type *IRTy = getTypeForLLT(MemOps[0], C);
1213     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1214 
1215     // Don't promote to an alignment that would require dynamic stack
1216     // realignment.
1217     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1218     if (!TRI->needsStackRealignment(MF))
1219       while (NewAlign > Alignment &&
1220              DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1221         NewAlign /= 2;
1222 
1223     if (NewAlign > Alignment) {
1224       Alignment = NewAlign;
1225       unsigned FI = FIDef->getOperand(1).getIndex();
1226       // Give the stack frame object a larger alignment if needed.
1227       if (MFI.getObjectAlignment(FI) < Alignment)
1228         MFI.setObjectAlignment(FI, Alignment);
1229     }
1230   }
1231 
1232   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
1233 
1234   MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the
  // stores, since the source and destination ranges may overlap and the
  // stores could otherwise clobber bytes that still need to be read. Apart
  // from that, this loop is pretty much doing the same thing as the memcpy
  // codegen function.
1238   unsigned CurrOffset = 0;
1239   LLT PtrTy = MRI.getType(Src);
1240   SmallVector<Register, 16> LoadVals;
1241   for (auto CopyTy : MemOps) {
1242     // Construct MMO for the load.
1243     auto *LoadMMO =
1244         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1245 
1246     // Create the load.
1247     Register LoadPtr = Src;
1248     if (CurrOffset != 0) {
1249       auto Offset =
1250           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1251       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1252     }
1253     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
1254     CurrOffset += CopyTy.getSizeInBytes();
1255   }
1256 
1257   CurrOffset = 0;
1258   for (unsigned I = 0; I < MemOps.size(); ++I) {
1259     LLT CopyTy = MemOps[I];
1260     // Now store the values loaded.
1261     auto *StoreMMO =
1262         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1263 
1264     Register StorePtr = Dst;
1265     if (CurrOffset != 0) {
1266       auto Offset =
1267           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1268       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1269     }
1270     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
1271     CurrOffset += CopyTy.getSizeInBytes();
1272   }
1273   MI.eraseFromParent();
1274   return true;
1275 }
1276 
1277 bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
1278   // This combine is fairly complex so it's not written with a separate
1279   // matcher function.
1280   assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1281   Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
1282   assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
1283           ID == Intrinsic::memset) &&
1284          "Expected a memcpy like intrinsic");
1285 
1286   auto MMOIt = MI.memoperands_begin();
1287   const MachineMemOperand *MemOp = *MMOIt;
1288   bool IsVolatile = MemOp->isVolatile();
1289   // Don't try to optimize volatile.
1290   if (IsVolatile)
1291     return false;
1292 
1293   unsigned DstAlign = MemOp->getBaseAlignment();
1294   unsigned SrcAlign = 0;
1295   Register Dst = MI.getOperand(1).getReg();
1296   Register Src = MI.getOperand(2).getReg();
1297   Register Len = MI.getOperand(3).getReg();
1298 
1299   if (ID != Intrinsic::memset) {
1300     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
1301     MemOp = *(++MMOIt);
1302     SrcAlign = MemOp->getBaseAlignment();
1303   }
1304 
1305   // See if this is a constant length copy
1306   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
1307   if (!LenVRegAndVal)
1308     return false; // Leave it to the legalizer to lower it to a libcall.
1309   unsigned KnownLen = LenVRegAndVal->Value;
1310 
1311   if (KnownLen == 0) {
1312     MI.eraseFromParent();
1313     return true;
1314   }
1315 
1316   if (MaxLen && KnownLen > MaxLen)
1317     return false;
1318 
  if (ID == Intrinsic::memcpy)
    return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                          IsVolatile);
  if (ID == Intrinsic::memmove)
    return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
  if (ID == Intrinsic::memset)
    return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
1325   return false;
1326 }
1327 
1328 bool CombinerHelper::tryCombine(MachineInstr &MI) {
1329   if (tryCombineCopy(MI))
1330     return true;
1331   if (tryCombineExtendingLoads(MI))
1332     return true;
1333   if (tryCombineIndexedLoadStore(MI))
1334     return true;
1335   return false;
1336 }
1337