//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
9 #include "llvm/CodeGen/GlobalISel/Combiner.h"
10 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
11 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
13 #include "llvm/CodeGen/GlobalISel/Utils.h"
14 #include "llvm/CodeGen/MachineDominators.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineInstr.h"
17 #include "llvm/CodeGen/MachineRegisterInfo.h"
18 #include "llvm/CodeGen/TargetInstrInfo.h"
19 #include "llvm/CodeGen/TargetLowering.h"
20 #include "llvm/Target/TargetMachine.h"
21 
22 #define DEBUG_TYPE "gi-combiner"
23 
24 using namespace llvm;
25 
26 // Option to allow testing of the combiner while no targets know about indexed
27 // addressing.
28 static cl::opt<bool>
29     ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
30                        cl::desc("Force all indexed operations to be "
31                                 "legal for the GlobalISel combiner"));
32 
33 
34 CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
35                                MachineIRBuilder &B, GISelKnownBits *KB,
36                                MachineDominatorTree *MDT)
37     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
38       KB(KB), MDT(MDT) {
39   (void)this->KB;
40 }
41 
42 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
43                                     Register ToReg) const {
44   Observer.changingAllUsesOfReg(MRI, FromReg);
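  // If ToReg's attributes (class/bank/type) can be constrained to also cover
  // FromReg's, rewire every use of FromReg directly; otherwise fall back to
  // emitting a COPY from FromReg into ToReg.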
45 
46   if (MRI.constrainRegAttrs(ToReg, FromReg))
47     MRI.replaceRegWith(FromReg, ToReg);
48   else
49     Builder.buildCopy(ToReg, FromReg);
50 
51   Observer.finishedChangingAllUsesOfReg();
52 }
53 
54 void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
55                                       MachineOperand &FromRegOp,
56                                       Register ToReg) const {
57   assert(FromRegOp.getParent() && "Expected an operand in an MI");
58   Observer.changingInstr(*FromRegOp.getParent());
59 
60   FromRegOp.setReg(ToReg);
61 
62   Observer.changedInstr(*FromRegOp.getParent());
63 }
64 
65 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
66   if (matchCombineCopy(MI)) {
67     applyCombineCopy(MI);
68     return true;
69   }
70   return false;
71 }
72 bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
73   if (MI.getOpcode() != TargetOpcode::COPY)
74     return false;
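  // Simple copy propagation: %a:_(sN) = COPY %b:_(sN) can be folded by
  // replacing all uses of %a with %b, provided the register constraints
  // checked below allow it.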
75   Register DstReg = MI.getOperand(0).getReg();
76   Register SrcReg = MI.getOperand(1).getReg();
77 
  // Give up if either DstReg or SrcReg is a physical register.
79   if (Register::isPhysicalRegister(DstReg) ||
80       Register::isPhysicalRegister(SrcReg))
81     return false;
82 
  // Check that the types are compatible.
84   LLT DstTy = MRI.getType(DstReg);
85   LLT SrcTy = MRI.getType(SrcReg);
86   // Give up if one has a valid LLT, but the other doesn't.
87   if (DstTy.isValid() != SrcTy.isValid())
88     return false;
89   // Give up if the types don't match.
90   if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
91     return false;
92 
93   // Get the register banks and classes.
94   const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
95   const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
96   const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
97   const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
98 
99   // Replace if the register constraints match.
100   if ((SrcRC == DstRC) && (SrcBank == DstBank))
101     return true;
102   // Replace if DstReg has no constraints.
103   if (!DstBank && !DstRC)
104     return true;
105 
106   return false;
107 }
108 void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
109   Register DstReg = MI.getOperand(0).getReg();
110   Register SrcReg = MI.getOperand(1).getReg();
111   MI.eraseFromParent();
112   replaceRegWith(MRI, DstReg, SrcReg);
113 }
114 
115 bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
116   bool IsUndef = false;
117   SmallVector<Register, 4> Ops;
118   if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
119     applyCombineConcatVectors(MI, IsUndef, Ops);
120     return true;
121   }
122   return false;
123 }
124 
125 bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
126                                                SmallVectorImpl<Register> &Ops) {
127   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
128          "Invalid instruction");
129   IsUndef = true;
130   MachineInstr *Undef = nullptr;
131 
132   // Walk over all the operands of concat vectors and check if they are
133   // build_vector themselves or undef.
134   // Then collect their operands in Ops.
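  // For example (illustrative MIR, register names are made up):
  //   %x:_(<2 x s32>) = G_BUILD_VECTOR %a(s32), %b(s32)
  //   %y:_(<2 x s32>) = G_IMPLICIT_DEF
  //   %z:_(<4 x s32>) = G_CONCAT_VECTORS %x(<2 x s32>), %y(<2 x s32>)
  // would be flattened by the apply step into:
  //   %u:_(s32) = G_IMPLICIT_DEF
  //   %z:_(<4 x s32>) = G_BUILD_VECTOR %a(s32), %b(s32), %u(s32), %u(s32)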
135   for (const MachineOperand &MO : MI.uses()) {
136     Register Reg = MO.getReg();
137     MachineInstr *Def = MRI.getVRegDef(Reg);
138     assert(Def && "Operand not defined");
139     switch (Def->getOpcode()) {
140     case TargetOpcode::G_BUILD_VECTOR:
141       IsUndef = false;
142       // Remember the operands of the build_vector to fold
143       // them into the yet-to-build flattened concat vectors.
144       for (const MachineOperand &BuildVecMO : Def->uses())
145         Ops.push_back(BuildVecMO.getReg());
146       break;
147     case TargetOpcode::G_IMPLICIT_DEF: {
148       LLT OpType = MRI.getType(Reg);
149       // Keep one undef value for all the undef operands.
150       if (!Undef) {
151         Builder.setInsertPt(*MI.getParent(), MI);
152         Undef = Builder.buildUndef(OpType.getScalarType());
153       }
154       assert(MRI.getType(Undef->getOperand(0).getReg()) ==
155                  OpType.getScalarType() &&
156              "All undefs should have the same type");
157       // Break the undef vector in as many scalar elements as needed
158       // for the flattening.
159       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
160            EltIdx != EltEnd; ++EltIdx)
161         Ops.push_back(Undef->getOperand(0).getReg());
162       break;
163     }
164     default:
165       return false;
166     }
167   }
168   return true;
169 }
170 void CombinerHelper::applyCombineConcatVectors(
171     MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
  // We determined that the concat_vectors can be flattened.
173   // Generate the flattened build_vector.
174   Register DstReg = MI.getOperand(0).getReg();
175   Builder.setInsertPt(*MI.getParent(), MI);
176   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
177 
  // Note: IsUndef is sort of redundant. We could have determined it by
  // checking that all Ops are undef.  Alternatively, we could have
  // generated a build_vector of undefs and relied on another combine to
181   // clean that up.  For now, given we already gather this information
182   // in tryCombineConcatVectors, just save compile time and issue the
183   // right thing.
184   if (IsUndef)
185     Builder.buildUndef(NewDstReg);
186   else
187     Builder.buildBuildVector(NewDstReg, Ops);
188   MI.eraseFromParent();
189   replaceRegWith(MRI, DstReg, NewDstReg);
190 }
191 
192 bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
193   SmallVector<Register, 4> Ops;
194   if (matchCombineShuffleVector(MI, Ops)) {
195     applyCombineShuffleVector(MI, Ops);
196     return true;
197   }
198   return false;
199 }
200 
201 bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
202                                                SmallVectorImpl<Register> &Ops) {
203   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
204          "Invalid instruction kind");
205   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
206   Register Src1 = MI.getOperand(1).getReg();
207   LLT SrcType = MRI.getType(Src1);
  // As bizarre as it may look, shuffle vector can actually produce a
  // scalar! This is because at the IR level a <1 x ty> shuffle
210   // vector is perfectly valid.
211   unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
212   unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
213 
214   // If the resulting vector is smaller than the size of the source
215   // vectors being concatenated, we won't be able to replace the
  // shuffle vector with a concat_vectors.
217   //
218   // Note: We may still be able to produce a concat_vectors fed by
219   //       extract_vector_elt and so on. It is less clear that would
220   //       be better though, so don't bother for now.
221   //
222   // If the destination is a scalar, the size of the sources doesn't
  // matter. We will lower the shuffle to a plain copy. This will
224   // work only if the source and destination have the same size. But
225   // that's covered by the next condition.
226   //
  // TODO: If the sizes of the source and destination don't match,
228   //       we could still emit an extract vector element in that case.
229   if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
230     return false;
231 
232   // Check that the shuffle mask can be broken evenly between the
233   // different sources.
234   if (DstNumElts % SrcNumElts != 0)
235     return false;
236 
237   // Mask length is a multiple of the source vector length.
238   // Check if the shuffle is some kind of concatenation of the input
239   // vectors.
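  // For example (illustrative MIR, register names are made up), the identity
  // concatenation mask:
  //   %d:_(<4 x s32>) = G_SHUFFLE_VECTOR %a(<2 x s32>), %b(<2 x s32>),
  //                                      shufflemask(0, 1, 2, 3)
  // is equivalent to:
  //   %d:_(<4 x s32>) = G_CONCAT_VECTORS %a(<2 x s32>), %b(<2 x s32>)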
240   unsigned NumConcat = DstNumElts / SrcNumElts;
241   SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
242   SmallVector<int, 8> Mask;
243   ShuffleVectorInst::getShuffleMask(MI.getOperand(3).getShuffleMask(), Mask);
244   for (unsigned i = 0; i != DstNumElts; ++i) {
245     int Idx = Mask[i];
246     // Undef value.
247     if (Idx < 0)
248       continue;
249     // Ensure the indices in each SrcType sized piece are sequential and that
250     // the same source is used for the whole piece.
251     if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
252         (ConcatSrcs[i / SrcNumElts] >= 0 &&
253          ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
254       return false;
255     // Remember which source this index came from.
256     ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
257   }
258 
259   // The shuffle is concatenating multiple vectors together.
260   // Collect the different operands for that.
261   Register UndefReg;
262   Register Src2 = MI.getOperand(2).getReg();
263   for (auto Src : ConcatSrcs) {
264     if (Src < 0) {
265       if (!UndefReg) {
266         Builder.setInsertPt(*MI.getParent(), MI);
267         UndefReg = Builder.buildUndef(SrcType).getReg(0);
268       }
269       Ops.push_back(UndefReg);
270     } else if (Src == 0)
271       Ops.push_back(Src1);
272     else
273       Ops.push_back(Src2);
274   }
275   return true;
276 }
277 
278 void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
279                                                const ArrayRef<Register> Ops) {
280   Register DstReg = MI.getOperand(0).getReg();
281   Builder.setInsertPt(*MI.getParent(), MI);
282   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
283 
284   if (Ops.size() == 1)
285     Builder.buildCopy(NewDstReg, Ops[0]);
286   else
287     Builder.buildMerge(NewDstReg, Ops);
288 
289   MI.eraseFromParent();
290   replaceRegWith(MRI, DstReg, NewDstReg);
291 }
292 
293 namespace {
294 
295 /// Select a preference between two uses. CurrentUse is the current preference
/// while the *ForCandidate arguments describe the candidate under
/// consideration.
297 PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
298                                   const LLT &TyForCandidate,
299                                   unsigned OpcodeForCandidate,
300                                   MachineInstr *MIForCandidate) {
301   if (!CurrentUse.Ty.isValid()) {
302     if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
303         CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
304       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
305     return CurrentUse;
306   }
307 
308   // We permit the extend to hoist through basic blocks but this is only
309   // sensible if the target has extending loads. If you end up lowering back
310   // into a load and extend during the legalizer then the end result is
311   // hoisting the extend up to the load.
312 
313   // Prefer defined extensions to undefined extensions as these are more
314   // likely to reduce the number of instructions.
315   if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
316       CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
317     return CurrentUse;
318   else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
319            OpcodeForCandidate != TargetOpcode::G_ANYEXT)
320     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
321 
322   // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so folding them into the load is the bigger win.
324   if (CurrentUse.Ty == TyForCandidate) {
325     if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
326         OpcodeForCandidate == TargetOpcode::G_ZEXT)
327       return CurrentUse;
328     else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
329              OpcodeForCandidate == TargetOpcode::G_SEXT)
330       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
331   }
332 
333   // This is potentially target specific. We've chosen the largest type
334   // because G_TRUNC is usually free. One potential catch with this is that
335   // some targets have a reduced number of larger registers than smaller
336   // registers and this choice potentially increases the live-range for the
337   // larger value.
338   if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
339     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
340   }
341   return CurrentUse;
342 }
343 
344 /// Find a suitable place to insert some instructions and insert them. This
345 /// function accounts for special cases like inserting before a PHI node.
346 /// The current strategy for inserting before PHI's is to duplicate the
347 /// instructions for each predecessor. However, while that's ok for G_TRUNC
348 /// on most targets since it generally requires no code, other targets/cases may
349 /// want to try harder to find a dominating block.
350 static void InsertInsnsWithoutSideEffectsBeforeUse(
351     MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
352     std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
353                        MachineOperand &UseMO)>
354         Inserter) {
355   MachineInstr &UseMI = *UseMO.getParent();
356 
357   MachineBasicBlock *InsertBB = UseMI.getParent();
358 
359   // If the use is a PHI then we want the predecessor block instead.
360   if (UseMI.isPHI()) {
361     MachineOperand *PredBB = std::next(&UseMO);
362     InsertBB = PredBB->getMBB();
363   }
364 
365   // If the block is the same block as the def then we want to insert just after
366   // the def instead of at the start of the block.
367   if (InsertBB == DefMI.getParent()) {
368     MachineBasicBlock::iterator InsertPt = &DefMI;
369     Inserter(InsertBB, std::next(InsertPt), UseMO);
370     return;
371   }
372 
373   // Otherwise we want the start of the BB
374   Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
375 }
376 } // end anonymous namespace
377 
378 bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
379   PreferredTuple Preferred;
380   if (matchCombineExtendingLoads(MI, Preferred)) {
381     applyCombineExtendingLoads(MI, Preferred);
382     return true;
383   }
384   return false;
385 }
386 
387 bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
388                                                 PreferredTuple &Preferred) {
389   // We match the loads and follow the uses to the extend instead of matching
390   // the extends and following the def to the load. This is because the load
391   // must remain in the same position for correctness (unless we also add code
392   // to find a safe place to sink it) whereas the extend is freely movable.
393   // It also prevents us from duplicating the load for the volatile case or just
394   // for performance.
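  // For example (illustrative MIR, register names are made up):
  //   %v:_(s16) = G_LOAD %ptr(p0) :: (load 2)
  //   %e:_(s32) = G_SEXT %v(s16)
  // is combined by the apply step into:
  //   %e:_(s32) = G_SEXTLOAD %ptr(p0) :: (load 2)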
395 
396   if (MI.getOpcode() != TargetOpcode::G_LOAD &&
397       MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
398       MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
399     return false;
400 
401   auto &LoadValue = MI.getOperand(0);
402   assert(LoadValue.isReg() && "Result wasn't a register?");
403 
404   LLT LoadValueTy = MRI.getType(LoadValue.getReg());
405   if (!LoadValueTy.isScalar())
406     return false;
407 
  // Most architectures are going to legalize sub-byte (<s8) loads into at
  // least a 1 byte load, and the MMOs can only describe memory accesses in
  // multiples of bytes.
410   // If we try to perform extload combining on those, we can end up with
411   // %a(s8) = extload %ptr (load 1 byte from %ptr)
412   // ... which is an illegal extload instruction.
413   if (LoadValueTy.getSizeInBits() < 8)
414     return false;
415 
416   // For non power-of-2 types, they will very likely be legalized into multiple
417   // loads. Don't bother trying to match them into extending loads.
418   if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
419     return false;
420 
421   // Find the preferred type aside from the any-extends (unless it's the only
422   // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
424   // relative type sizes. At the same time, pick an extend to use based on the
425   // extend involved in the chosen type.
426   unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
427                                  ? TargetOpcode::G_ANYEXT
428                                  : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
429                                        ? TargetOpcode::G_SEXT
430                                        : TargetOpcode::G_ZEXT;
431   Preferred = {LLT(), PreferredOpcode, nullptr};
432   for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
433     if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
434         UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
435         UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
436       Preferred = ChoosePreferredUse(Preferred,
437                                      MRI.getType(UseMI.getOperand(0).getReg()),
438                                      UseMI.getOpcode(), &UseMI);
439     }
440   }
441 
442   // There were no extends
443   if (!Preferred.MI)
444     return false;
  // It should be impossible to choose an extend without selecting a different
446   // type since by definition the result of an extend is larger.
447   assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
448 
449   LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
450   return true;
451 }
452 
453 void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
454                                                 PreferredTuple &Preferred) {
455   // Rewrite the load to the chosen extending load.
456   Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
457 
458   // Inserter to insert a truncate back to the original type at a given point
459   // with some basic CSE to limit truncate duplication to one per BB.
460   DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
461   auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
462                            MachineBasicBlock::iterator InsertBefore,
463                            MachineOperand &UseMO) {
464     MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
465     if (PreviouslyEmitted) {
466       Observer.changingInstr(*UseMO.getParent());
467       UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
468       Observer.changedInstr(*UseMO.getParent());
469       return;
470     }
471 
472     Builder.setInsertPt(*InsertIntoBB, InsertBefore);
473     Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
474     MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
475     EmittedInsns[InsertIntoBB] = NewMI;
476     replaceRegOpWith(MRI, UseMO, NewDstReg);
477   };
478 
479   Observer.changingInstr(MI);
480   MI.setDesc(
481       Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
482                                ? TargetOpcode::G_SEXTLOAD
483                                : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
484                                      ? TargetOpcode::G_ZEXTLOAD
485                                      : TargetOpcode::G_LOAD));
486 
487   // Rewrite all the uses to fix up the types.
488   auto &LoadValue = MI.getOperand(0);
489   SmallVector<MachineOperand *, 4> Uses;
490   for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
491     Uses.push_back(&UseMO);
492 
493   for (auto *UseMO : Uses) {
494     MachineInstr *UseMI = UseMO->getParent();
495 
496     // If the extend is compatible with the preferred extend then we should fix
497     // up the type and extend so that it uses the preferred use.
498     if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
499         UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
500       Register UseDstReg = UseMI->getOperand(0).getReg();
501       MachineOperand &UseSrcMO = UseMI->getOperand(1);
502       const LLT &UseDstTy = MRI.getType(UseDstReg);
503       if (UseDstReg != ChosenDstReg) {
504         if (Preferred.Ty == UseDstTy) {
505           // If the use has the same type as the preferred use, then merge
506           // the vregs and erase the extend. For example:
507           //    %1:_(s8) = G_LOAD ...
508           //    %2:_(s32) = G_SEXT %1(s8)
509           //    %3:_(s32) = G_ANYEXT %1(s8)
510           //    ... = ... %3(s32)
511           // rewrites to:
512           //    %2:_(s32) = G_SEXTLOAD ...
513           //    ... = ... %2(s32)
514           replaceRegWith(MRI, UseDstReg, ChosenDstReg);
515           Observer.erasingInstr(*UseMO->getParent());
516           UseMO->getParent()->eraseFromParent();
517         } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
518           // If the preferred size is smaller, then keep the extend but extend
519           // from the result of the extending load. For example:
520           //    %1:_(s8) = G_LOAD ...
521           //    %2:_(s32) = G_SEXT %1(s8)
522           //    %3:_(s64) = G_ANYEXT %1(s8)
523           //    ... = ... %3(s64)
          // rewrites to:
525           //    %2:_(s32) = G_SEXTLOAD ...
526           //    %3:_(s64) = G_ANYEXT %2:_(s32)
527           //    ... = ... %3(s64)
528           replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
529         } else {
          // If the preferred size is larger, then insert a truncate. For
531           // example:
532           //    %1:_(s8) = G_LOAD ...
533           //    %2:_(s64) = G_SEXT %1(s8)
534           //    %3:_(s32) = G_ZEXT %1(s8)
535           //    ... = ... %3(s32)
          // rewrites to:
          //    %2:_(s64) = G_SEXTLOAD ...
          //    %4:_(s8) = G_TRUNC %2:_(s64)
          //    %3:_(s32) = G_ZEXT %4:_(s8)
          //    ... = ... %3(s32)
541           InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
542                                                  InsertTruncAt);
543         }
544         continue;
545       }
      // This use is the preferred extend we chose earlier. We're going to
      // update the load to def this value later, so just erase the old
      // extend.
549       Observer.erasingInstr(*UseMO->getParent());
550       UseMO->getParent()->eraseFromParent();
551       continue;
552     }
553 
554     // The use isn't an extend. Truncate back to the type we originally loaded.
555     // This is free on many targets.
556     InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
557   }
558 
559   MI.getOperand(0).setReg(ChosenDstReg);
560   Observer.changedInstr(MI);
561 }
562 
563 bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
564   assert(DefMI.getParent() == UseMI.getParent());
565   if (&DefMI == &UseMI)
566     return false;
567 
568   // Loop through the basic block until we find one of the instructions.
569   MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
  for (; &*I != &DefMI && &*I != &UseMI; ++I)
    ;

  // DefMI precedes UseMI if we reached DefMI first.
  return &*I == &DefMI;
574 }
575 
576 bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
577   if (MDT)
578     return MDT->dominates(&DefMI, &UseMI);
579   else if (DefMI.getParent() != UseMI.getParent())
580     return false;
581 
582   return isPredecessor(DefMI, UseMI);
583 }
584 
585 bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
586                                             Register &Base, Register &Offset) {
587   auto &MF = *MI.getParent()->getParent();
588   const auto &TLI = *MF.getSubtarget().getTargetLowering();
589 
590 #ifndef NDEBUG
591   unsigned Opcode = MI.getOpcode();
592   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
593          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
594 #endif
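  // We are looking for a pattern like the following (illustrative MIR,
  // register names are made up):
  //   %v:_(s32) = G_LOAD %base(p0) :: (load 4)
  //   ...
  //   %addr:_(p0) = G_PTR_ADD %base, %offset
  // which the apply step can turn into a single post-indexed operation:
  //   %v:_(s32), %addr:_(p0) = G_INDEXED_LOAD %base, %offset, 0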
595 
596   Base = MI.getOperand(1).getReg();
597   MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
598   if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
599     return false;
600 
601   LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
602 
603   for (auto &Use : MRI.use_instructions(Base)) {
604     if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
605       continue;
606 
607     Offset = Use.getOperand(2).getReg();
608     if (!ForceLegalIndexing &&
609         !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
610       LLVM_DEBUG(dbgs() << "    Ignoring candidate with illegal addrmode: "
611                         << Use);
612       continue;
613     }
614 
615     // Make sure the offset calculation is before the potentially indexed op.
616     // FIXME: we really care about dependency here. The offset calculation might
617     // be movable.
618     MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
619     if (!OffsetDef || !dominates(*OffsetDef, MI)) {
620       LLVM_DEBUG(dbgs() << "    Ignoring candidate with offset after mem-op: "
621                         << Use);
622       continue;
623     }
624 
625     // FIXME: check whether all uses of Base are load/store with foldable
626     // addressing modes. If so, using the normal addr-modes is better than
627     // forming an indexed one.
628 
629     bool MemOpDominatesAddrUses = true;
630     for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
631       if (!dominates(MI, PtrAddUse)) {
632         MemOpDominatesAddrUses = false;
633         break;
634       }
635     }
636 
637     if (!MemOpDominatesAddrUses) {
638       LLVM_DEBUG(
639           dbgs() << "    Ignoring candidate as memop does not dominate uses: "
640                  << Use);
641       continue;
642     }
643 
644     LLVM_DEBUG(dbgs() << "    Found match: " << Use);
645     Addr = Use.getOperand(0).getReg();
646     return true;
647   }
648 
649   return false;
650 }
651 
652 bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
653                                            Register &Base, Register &Offset) {
654   auto &MF = *MI.getParent()->getParent();
655   const auto &TLI = *MF.getSubtarget().getTargetLowering();
656 
657 #ifndef NDEBUG
658   unsigned Opcode = MI.getOpcode();
659   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
660          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
661 #endif
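  // Here we are after the pre-indexed form (illustrative MIR, register names
  // are made up):
  //   %addr:_(p0) = G_PTR_ADD %base, %offset
  //   %v:_(s32) = G_LOAD %addr(p0) :: (load 4)
  // which the apply step can turn into:
  //   %v:_(s32), %addr:_(p0) = G_INDEXED_LOAD %base, %offset, 1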
662 
663   Addr = MI.getOperand(1).getReg();
664   MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
665   if (!AddrDef || MRI.hasOneUse(Addr))
666     return false;
667 
668   Base = AddrDef->getOperand(1).getReg();
669   Offset = AddrDef->getOperand(2).getReg();
670 
671   LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
672 
673   if (!ForceLegalIndexing &&
674       !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
675     LLVM_DEBUG(dbgs() << "    Skipping, not legal for target");
676     return false;
677   }
678 
679   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
680   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
681     LLVM_DEBUG(dbgs() << "    Skipping, frame index would need copy anyway.");
682     return false;
683   }
684 
685   if (MI.getOpcode() == TargetOpcode::G_STORE) {
686     // Would require a copy.
687     if (Base == MI.getOperand(0).getReg()) {
688       LLVM_DEBUG(dbgs() << "    Skipping, storing base so need copy anyway.");
689       return false;
690     }
691 
692     // We're expecting one use of Addr in MI, but it could also be the
693     // value stored, which isn't actually dominated by the instruction.
694     if (MI.getOperand(0).getReg() == Addr) {
695       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses");
696       return false;
697     }
698   }
699 
700   // FIXME: check whether all uses of the base pointer are constant PtrAdds.
701   // That might allow us to end base's liveness here by adjusting the constant.
702 
703   for (auto &UseMI : MRI.use_instructions(Addr)) {
704     if (!dominates(MI, UseMI)) {
705       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses.");
706       return false;
707     }
708   }
709 
710   return true;
711 }
712 
713 bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
714   IndexedLoadStoreMatchInfo MatchInfo;
715   if (matchCombineIndexedLoadStore(MI, MatchInfo)) {
716     applyCombineIndexedLoadStore(MI, MatchInfo);
717     return true;
718   }
719   return false;
720 }
721 
722 bool CombinerHelper::matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
723   unsigned Opcode = MI.getOpcode();
724   if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
725       Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
726     return false;
727 
728   MatchInfo.IsPre = findPreIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
729                                           MatchInfo.Offset);
730   if (!MatchInfo.IsPre &&
731       !findPostIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
732                               MatchInfo.Offset))
733     return false;
734 
735   return true;
736 }
737 
738 void CombinerHelper::applyCombineIndexedLoadStore(
739     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
740   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
741   MachineIRBuilder MIRBuilder(MI);
742   unsigned Opcode = MI.getOpcode();
743   bool IsStore = Opcode == TargetOpcode::G_STORE;
744   unsigned NewOpcode;
745   switch (Opcode) {
746   case TargetOpcode::G_LOAD:
747     NewOpcode = TargetOpcode::G_INDEXED_LOAD;
748     break;
749   case TargetOpcode::G_SEXTLOAD:
750     NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
751     break;
752   case TargetOpcode::G_ZEXTLOAD:
753     NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
754     break;
755   case TargetOpcode::G_STORE:
756     NewOpcode = TargetOpcode::G_INDEXED_STORE;
757     break;
758   default:
759     llvm_unreachable("Unknown load/store opcode");
760   }
761 
762   auto MIB = MIRBuilder.buildInstr(NewOpcode);
763   if (IsStore) {
764     MIB.addDef(MatchInfo.Addr);
765     MIB.addUse(MI.getOperand(0).getReg());
766   } else {
767     MIB.addDef(MI.getOperand(0).getReg());
768     MIB.addDef(MatchInfo.Addr);
769   }
770 
771   MIB.addUse(MatchInfo.Base);
772   MIB.addUse(MatchInfo.Offset);
773   MIB.addImm(MatchInfo.IsPre);
774   MI.eraseFromParent();
775   AddrDef.eraseFromParent();
776 
777   LLVM_DEBUG(dbgs() << "    Combinined to indexed operation");
778 }
779 
780 bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
781   if (MI.getOpcode() != TargetOpcode::G_BR)
782     return false;
783 
784   // Try to match the following:
785   // bb1:
786   //   %c(s32) = G_ICMP pred, %a, %b
787   //   %c1(s1) = G_TRUNC %c(s32)
788   //   G_BRCOND %c1, %bb2
789   //   G_BR %bb3
790   // bb2:
791   // ...
792   // bb3:
793 
794   // The above pattern does not have a fall through to the successor bb2, always
795   // resulting in a branch no matter which path is taken. Here we try to find
  // and replace that pattern with a conditional branch to bb3 and otherwise
  // a fallthrough to bb2.
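  // In other words (illustrative), the rewrite produces:
  // bb1:
  //   %c(s32) = G_ICMP inverse(pred), %a, %b
  //   %c1(s1) = G_TRUNC %c(s32)
  //   G_BRCOND %c1, %bb3
  // bb2 (fallthrough):
  // ...
  // bb3: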
798 
799   MachineBasicBlock *MBB = MI.getParent();
800   MachineBasicBlock::iterator BrIt(MI);
801   if (BrIt == MBB->begin())
802     return false;
803   assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
804 
805   MachineInstr *BrCond = &*std::prev(BrIt);
806   if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
807     return false;
808 
809   // Check that the next block is the conditional branch target.
810   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
811     return false;
812 
813   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
814   if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
815       !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
816     return false;
817   return true;
818 }
819 
820 bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
821   if (!matchElideBrByInvertingCond(MI))
822     return false;
823   applyElideBrByInvertingCond(MI);
824   return true;
825 }
826 
827 void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
828   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
829   MachineBasicBlock::iterator BrIt(MI);
830   MachineInstr *BrCond = &*std::prev(BrIt);
831   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
832 
833   CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
834       (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
835 
836   // Invert the G_ICMP condition.
837   Observer.changingInstr(*CmpMI);
838   CmpMI->getOperand(1).setPredicate(InversePred);
839   Observer.changedInstr(*CmpMI);
840 
841   // Change the conditional branch target.
842   Observer.changingInstr(*BrCond);
843   BrCond->getOperand(1).setMBB(BrTarget);
844   Observer.changedInstr(*BrCond);
845   MI.eraseFromParent();
846 }
847 
848 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
849   // On Darwin, -Os means optimize for size without hurting performance, so
850   // only really optimize for size when -Oz (MinSize) is used.
851   if (MF.getTarget().getTargetTriple().isOSDarwin())
852     return MF.getFunction().hasMinSize();
853   return MF.getFunction().hasOptSize();
854 }
855 
856 // Returns a list of types to use for memory op lowering in MemOps. A partial
857 // port of findOptimalMemOpLowering in TargetLowering.
858 static bool findGISelOptimalMemOpLowering(
859     std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
860     unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
861     bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
862     const AttributeList &FuncAttributes, const TargetLowering &TLI) {
863   // If 'SrcAlign' is zero, that means the memory operation does not need to
864   // load the value, i.e. memset or memcpy from constant string. Otherwise,
865   // it's the inferred alignment of the source. 'DstAlign', on the other hand,
866   // is the specified alignment of the memory operation. If it is zero, that
867   // means it's possible to change the alignment of the destination.
868   // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
869   // not need to be loaded.
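  // Illustrative example (assuming the target's preferred type here is s64):
  // lowering a 16-byte operation to an 8-byte aligned destination would
  // typically fill MemOps with {s64, s64}, i.e. two 8-byte accesses.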
870   if (SrcAlign != 0 && SrcAlign < DstAlign)
871     return false;
872 
873   LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
874                                   ZeroMemset, MemcpyStrSrc, FuncAttributes);
875 
876   if (Ty == LLT()) {
877     // Use the largest scalar type whose alignment constraints are satisfied.
878     // We only need to check DstAlign here as SrcAlign is always greater or
879     // equal to DstAlign (or zero).
880     Ty = LLT::scalar(64);
881     while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
882            !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
883       Ty = LLT::scalar(Ty.getSizeInBytes());
884     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
885     // FIXME: check for the largest legal type we can load/store to.
886   }
887 
888   unsigned NumMemOps = 0;
889   while (Size != 0) {
890     unsigned TySize = Ty.getSizeInBytes();
891     while (TySize > Size) {
892       // For now, only use non-vector load / store's for the left-over pieces.
893       LLT NewTy = Ty;
894       // FIXME: check for mem op safety and legality of the types. Not all of
895       // SDAGisms map cleanly to GISel concepts.
896       if (NewTy.isVector())
897         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
898       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
899       unsigned NewTySize = NewTy.getSizeInBytes();
900       assert(NewTySize > 0 && "Could not find appropriate type");
901 
902       // If the new LLT cannot cover all of the remaining bits, then consider
903       // issuing a (or a pair of) unaligned and overlapping load / store.
904       bool Fast;
905       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
906       MVT VT = getMVTForLLT(Ty);
907       if (NumMemOps && AllowOverlap && NewTySize < Size &&
908           TLI.allowsMisalignedMemoryAccesses(
909               VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
910           Fast)
911         TySize = Size;
912       else {
913         Ty = NewTy;
914         TySize = NewTySize;
915       }
916     }
917 
918     if (++NumMemOps > Limit)
919       return false;
920 
921     MemOps.push_back(Ty);
922     Size -= TySize;
923   }
924 
925   return true;
926 }
927 
928 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
929   if (Ty.isVector())
930     return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
931                            Ty.getNumElements());
932   return IntegerType::get(C, Ty.getSizeInBits());
933 }
934 
935 // Get a vectorized representation of the memset value operand, GISel edition.
936 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
937   MachineRegisterInfo &MRI = *MIB.getMRI();
938   unsigned NumBits = Ty.getScalarSizeInBits();
939   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
940   if (!Ty.isVector() && ValVRegAndVal) {
941     unsigned KnownVal = ValVRegAndVal->Value;
942     APInt Scalar = APInt(8, KnownVal);
943     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
944     return MIB.buildConstant(Ty, SplatVal).getReg(0);
945   }
946   // FIXME: for vector types create a G_BUILD_VECTOR.
947   if (Ty.isVector())
948     return Register();
949 
950   // Extend the byte value to the larger type, and then multiply by a magic
951   // value 0x010101... in order to replicate it across every byte.
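  // For example, an s8 value of 0xAB replicated into an s32 becomes
  // 0xAB * 0x01010101 = 0xABABABAB.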
952   LLT ExtType = Ty.getScalarType();
953   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
954   if (NumBits > 8) {
955     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
956     auto MagicMI = MIB.buildConstant(ExtType, Magic);
957     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
958   }
959 
960   assert(ExtType == Ty && "Vector memset value type not supported yet");
961   return Val;
962 }
963 
964 bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val,
965                                     unsigned KnownLen, unsigned Align,
966                                     bool IsVolatile) {
967   auto &MF = *MI.getParent()->getParent();
968   const auto &TLI = *MF.getSubtarget().getTargetLowering();
969   auto &DL = MF.getDataLayout();
970   LLVMContext &C = MF.getFunction().getContext();
971 
972   assert(KnownLen != 0 && "Have a zero length memset length!");
973 
974   bool DstAlignCanChange = false;
975   MachineFrameInfo &MFI = MF.getFrameInfo();
976   bool OptSize = shouldLowerMemFuncForSize(MF);
977 
978   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
979   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
980     DstAlignCanChange = true;
981 
982   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
983   std::vector<LLT> MemOps;
984 
985   const auto &DstMMO = **MI.memoperands_begin();
986   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
987 
988   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
989   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
990 
991   if (!findGISelOptimalMemOpLowering(
992           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
993           /*IsMemset=*/true,
994           /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
995           /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
996           MF.getFunction().getAttributes(), TLI))
997     return false;
998 
999   if (DstAlignCanChange) {
1000     // Get an estimate of the type from the LLT.
1001     Type *IRTy = getTypeForLLT(MemOps[0], C);
1002     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1003     if (NewAlign > Align) {
1004       Align = NewAlign;
1005       unsigned FI = FIDef->getOperand(1).getIndex();
1006       // Give the stack frame object a larger alignment if needed.
1007       if (MFI.getObjectAlignment(FI) < Align)
1008         MFI.setObjectAlignment(FI, Align);
1009     }
1010   }
1011 
1012   MachineIRBuilder MIB(MI);
1013   // Find the largest store and generate the bit pattern for it.
1014   LLT LargestTy = MemOps[0];
1015   for (unsigned i = 1; i < MemOps.size(); i++)
1016     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
1017       LargestTy = MemOps[i];
1018 
1019   // The memset stored value is always defined as an s8, so in order to make it
1020   // work with larger store types we need to repeat the bit pattern across the
1021   // wider type.
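  // For example, a memset value of 0xAB stored with an s64 store type uses
  // the pattern 0xABABABABABABABAB.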
1022   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
1023 
1024   if (!MemSetValue)
1025     return false;
1026 
1027   // Generate the stores. For each store type in the list, we generate the
1028   // matching store of that type to the destination address.
1029   LLT PtrTy = MRI.getType(Dst);
1030   unsigned DstOff = 0;
1031   unsigned Size = KnownLen;
1032   for (unsigned I = 0; I < MemOps.size(); I++) {
1033     LLT Ty = MemOps[I];
1034     unsigned TySize = Ty.getSizeInBytes();
1035     if (TySize > Size) {
1036       // Issuing an unaligned load / store pair that overlaps with the previous
1037       // pair. Adjust the offset accordingly.
1038       assert(I == MemOps.size() - 1 && I != 0);
1039       DstOff -= TySize - Size;
1040     }
1041 
1042     // If this store is smaller than the largest store see whether we can get
1043     // the smaller value for free with a truncate.
1044     Register Value = MemSetValue;
1045     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
1046       MVT VT = getMVTForLLT(Ty);
1047       MVT LargestVT = getMVTForLLT(LargestTy);
1048       if (!LargestTy.isVector() && !Ty.isVector() &&
1049           TLI.isTruncateFree(LargestVT, VT))
1050         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
1051       else
1052         Value = getMemsetValue(Val, Ty, MIB);
1053       if (!Value)
1054         return false;
1055     }
1056 
1057     auto *StoreMMO =
1058         MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
1059 
1060     Register Ptr = Dst;
1061     if (DstOff != 0) {
1062       auto Offset =
1063           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
1064       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1065     }
1066 
1067     MIB.buildStore(Value, Ptr, *StoreMMO);
1068     DstOff += Ty.getSizeInBytes();
1069     Size -= TySize;
1070   }
1071 
1072   MI.eraseFromParent();
1073   return true;
1074 }
1075 
1076 
1077 bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
1078                                     Register Src, unsigned KnownLen,
1079                                     unsigned DstAlign, unsigned SrcAlign,
1080                                     bool IsVolatile) {
1081   auto &MF = *MI.getParent()->getParent();
1082   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1083   auto &DL = MF.getDataLayout();
1084   LLVMContext &C = MF.getFunction().getContext();
1085 
1086   assert(KnownLen != 0 && "Have a zero length memcpy length!");
1087 
1088   bool DstAlignCanChange = false;
1089   MachineFrameInfo &MFI = MF.getFrameInfo();
1090   bool OptSize = shouldLowerMemFuncForSize(MF);
1091   unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1092 
1093   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1094   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1095     DstAlignCanChange = true;
1096 
1097   // FIXME: infer better src pointer alignment like SelectionDAG does here.
1098   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
1099   // if the memcpy is in a tail call position.
1100 
1101   unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
1102   std::vector<LLT> MemOps;
1103 
1104   const auto &DstMMO = **MI.memoperands_begin();
1105   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1106   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1107   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1108 
1109   if (!findGISelOptimalMemOpLowering(
1110           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1111           SrcAlign,
1112           /*IsMemset=*/false,
1113           /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1114           /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
1115           SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1116     return false;
1117 
1118   if (DstAlignCanChange) {
1119     // Get an estimate of the type from the LLT.
1120     Type *IRTy = getTypeForLLT(MemOps[0], C);
1121     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1122 
1123     // Don't promote to an alignment that would require dynamic stack
1124     // realignment.
1125     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1126     if (!TRI->needsStackRealignment(MF))
1127       while (NewAlign > Alignment &&
1128              DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1129         NewAlign /= 2;
1130 
1131     if (NewAlign > Alignment) {
1132       Alignment = NewAlign;
1133       unsigned FI = FIDef->getOperand(1).getIndex();
1134       // Give the stack frame object a larger alignment if needed.
1135       if (MFI.getObjectAlignment(FI) < Alignment)
1136         MFI.setObjectAlignment(FI, Alignment);
1137     }
1138   }
1139 
1140   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
1141 
1142   MachineIRBuilder MIB(MI);
1143   // Now we need to emit a pair of load and stores for each of the types we've
1144   // collected. I.e. for each type, generate a load from the source pointer of
1145   // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores of
  // mixed types, depending on what the target specifies as good types to use.
1148   unsigned CurrOffset = 0;
1149   LLT PtrTy = MRI.getType(Src);
1150   unsigned Size = KnownLen;
1151   for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
1153     // pair. Adjust the offset accordingly.
1154     if (CopyTy.getSizeInBytes() > Size)
1155       CurrOffset -= CopyTy.getSizeInBytes() - Size;
1156 
1157     // Construct MMOs for the accesses.
1158     auto *LoadMMO =
1159         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1160     auto *StoreMMO =
1161         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1162 
1163     // Create the load.
1164     Register LoadPtr = Src;
1165     Register Offset;
1166     if (CurrOffset != 0) {
1167       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
1168                    .getReg(0);
1169       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1170     }
1171     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
1172 
1173     // Create the store.
1174     Register StorePtr =
1175         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1176     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
1177     CurrOffset += CopyTy.getSizeInBytes();
1178     Size -= CopyTy.getSizeInBytes();
1179   }
1180 
1181   MI.eraseFromParent();
1182   return true;
1183 }
1184 
1185 bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
1186                                     Register Src, unsigned KnownLen,
1187                                     unsigned DstAlign, unsigned SrcAlign,
1188                                     bool IsVolatile) {
1189   auto &MF = *MI.getParent()->getParent();
1190   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1191   auto &DL = MF.getDataLayout();
1192   LLVMContext &C = MF.getFunction().getContext();
1193 
1194   assert(KnownLen != 0 && "Have a zero length memmove length!");
1195 
1196   bool DstAlignCanChange = false;
1197   MachineFrameInfo &MFI = MF.getFrameInfo();
1198   bool OptSize = shouldLowerMemFuncForSize(MF);
1199   unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1200 
1201   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1202   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1203     DstAlignCanChange = true;
1204 
1205   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
1206   std::vector<LLT> MemOps;
1207 
1208   const auto &DstMMO = **MI.memoperands_begin();
1209   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1210   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1211   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1212 
1213   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
1215   // same thing here.
1216   if (!findGISelOptimalMemOpLowering(
1217           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1218           SrcAlign,
1219           /*IsMemset=*/false,
1220           /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1221           /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
1222           SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1223     return false;
1224 
1225   if (DstAlignCanChange) {
1226     // Get an estimate of the type from the LLT.
1227     Type *IRTy = getTypeForLLT(MemOps[0], C);
1228     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1229 
1230     // Don't promote to an alignment that would require dynamic stack
1231     // realignment.
1232     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1233     if (!TRI->needsStackRealignment(MF))
1234       while (NewAlign > Alignment &&
1235              DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1236         NewAlign /= 2;
1237 
1238     if (NewAlign > Alignment) {
1239       Alignment = NewAlign;
1240       unsigned FI = FIDef->getOperand(1).getIndex();
1241       // Give the stack frame object a larger alignment if needed.
1242       if (MFI.getObjectAlignment(FI) < Alignment)
1243         MFI.setObjectAlignment(FI, Alignment);
1244     }
1245   }
1246 
1247   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
1248 
1249   MachineIRBuilder MIB(MI);
1250   // Memmove requires that we perform the loads first before issuing the stores.
1251   // Apart from that, this loop is pretty much doing the same thing as the
1252   // memcpy codegen function.
1253   unsigned CurrOffset = 0;
1254   LLT PtrTy = MRI.getType(Src);
1255   SmallVector<Register, 16> LoadVals;
1256   for (auto CopyTy : MemOps) {
1257     // Construct MMO for the load.
1258     auto *LoadMMO =
1259         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1260 
1261     // Create the load.
1262     Register LoadPtr = Src;
1263     if (CurrOffset != 0) {
1264       auto Offset =
1265           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1266       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1267     }
1268     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
1269     CurrOffset += CopyTy.getSizeInBytes();
1270   }
1271 
1272   CurrOffset = 0;
1273   for (unsigned I = 0; I < MemOps.size(); ++I) {
1274     LLT CopyTy = MemOps[I];
1275     // Now store the values loaded.
1276     auto *StoreMMO =
1277         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1278 
1279     Register StorePtr = Dst;
1280     if (CurrOffset != 0) {
1281       auto Offset =
1282           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1283       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1284     }
1285     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
1286     CurrOffset += CopyTy.getSizeInBytes();
1287   }
1288   MI.eraseFromParent();
1289   return true;
1290 }
1291 
1292 bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
1293   // This combine is fairly complex so it's not written with a separate
1294   // matcher function.
1295   assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1296   Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
1297   assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
1298           ID == Intrinsic::memset) &&
1299          "Expected a memcpy like intrinsic");
1300 
1301   auto MMOIt = MI.memoperands_begin();
1302   const MachineMemOperand *MemOp = *MMOIt;
1303   bool IsVolatile = MemOp->isVolatile();
1304   // Don't try to optimize volatile.
1305   if (IsVolatile)
1306     return false;
1307 
1308   unsigned DstAlign = MemOp->getBaseAlignment();
1309   unsigned SrcAlign = 0;
1310   Register Dst = MI.getOperand(1).getReg();
1311   Register Src = MI.getOperand(2).getReg();
1312   Register Len = MI.getOperand(3).getReg();
1313 
1314   if (ID != Intrinsic::memset) {
1315     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
1316     MemOp = *(++MMOIt);
1317     SrcAlign = MemOp->getBaseAlignment();
1318   }
1319 
1320   // See if this is a constant length copy
1321   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
1322   if (!LenVRegAndVal)
1323     return false; // Leave it to the legalizer to lower it to a libcall.
1324   unsigned KnownLen = LenVRegAndVal->Value;
1325 
1326   if (KnownLen == 0) {
1327     MI.eraseFromParent();
1328     return true;
1329   }
1330 
1331   if (MaxLen && KnownLen > MaxLen)
1332     return false;
1333 
1334   if (ID == Intrinsic::memcpy)
1335     return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1336   if (ID == Intrinsic::memmove)
1337     return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1338   if (ID == Intrinsic::memset)
1339     return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
1340   return false;
1341 }
1342 
1343 bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
1344                                            PtrAddChain &MatchInfo) {
1345   // We're trying to match the following pattern:
1346   //   %t1 = G_PTR_ADD %base, G_CONSTANT imm1
1347   //   %root = G_PTR_ADD %t1, G_CONSTANT imm2
1348   // -->
1349   //   %root = G_PTR_ADD %base, G_CONSTANT (imm1 + imm2)
1350 
1351   if (MI.getOpcode() != TargetOpcode::G_PTR_ADD)
1352     return false;
1353 
1354   Register Add2 = MI.getOperand(1).getReg();
1355   Register Imm1 = MI.getOperand(2).getReg();
1356   auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
1357   if (!MaybeImmVal)
1358     return false;
1359 
1360   MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
1361   if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
1362     return false;
1363 
1364   Register Base = Add2Def->getOperand(1).getReg();
1365   Register Imm2 = Add2Def->getOperand(2).getReg();
1366   auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
1367   if (!MaybeImm2Val)
1368     return false;
1369 
1370   // Pass the combined immediate to the apply function.
1371   MatchInfo.Imm = MaybeImmVal->Value + MaybeImm2Val->Value;
1372   MatchInfo.Base = Base;
1373   return true;
1374 }
1375 
1376 bool CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
1377                                            PtrAddChain &MatchInfo) {
1378   assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
1379   MachineIRBuilder MIB(MI);
1380   LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
1381   auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
1382   Observer.changingInstr(MI);
1383   MI.getOperand(1).setReg(MatchInfo.Base);
1384   MI.getOperand(2).setReg(NewOffset.getReg(0));
1385   Observer.changedInstr(MI);
1386   return true;
1387 }
1388 
1389 bool CombinerHelper::tryCombine(MachineInstr &MI) {
1390   if (tryCombineCopy(MI))
1391     return true;
1392   if (tryCombineExtendingLoads(MI))
1393     return true;
1394   if (tryCombineIndexedLoadStore(MI))
1395     return true;
1396   return false;
1397 }
1398