//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a pass that performs load / store related peephole
/// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "ThumbRegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "arm-ldst-opt"

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");

/// This switch disables formation of double/multi instructions that could
/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
/// disabled. This can be used to create libraries that are robust even when
/// users provoke undefined behaviour by supplying misaligned pointers.
/// \see mayCombineMisaligned()
static cl::opt<bool>
AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));

namespace llvm {
void initializeARMLoadStoreOptPass(PassRegistry &);
}

#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"

namespace {
  /// Post-register-allocation pass that combines load / store instructions to
  /// form ldm / stm instructions.
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {
      initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
    }

    const MachineFunction *MF;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    LivePhysRegs LiveRegs;
    RegisterClassInfo RegClassInfo;
    MachineBasicBlock::const_iterator LiveRegPos;
    bool LiveRegsValid;
    bool RegClassInfoValid;
    bool isThumb1, isThumb2;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    MachineFunctionProperties getRequiredProperties() const override {
      return MachineFunctionProperties().set(
          MachineFunctionProperties::Property::AllVRegsAllocated);
    }

    const char *getPassName() const override {
      return ARM_LOAD_STORE_OPT_NAME;
    }

  private:
    /// A set of load/store MachineInstrs with the same base register, sorted
    /// by offset.
    struct MemOpQueueEntry {
      MachineInstr *MI;
      int Offset;        ///< Load/Store offset.
      unsigned Position; ///< Position as counted from end of basic block.
      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
        : MI(MI), Offset(Offset), Position(Position) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;

    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
    /// merged into a LDM/STM.
    struct MergeCandidate {
      /// List of instructions ordered by load/store offset.
      SmallVector<MachineInstr*, 4> Instrs;
      /// Index in Instrs of the instruction that is latest in the schedule.
      unsigned LatestMIIdx;
      /// Index in Instrs of the instruction that is earliest in the schedule.
      unsigned EarliestMIIdx;
      /// Index into the basic block where the merged instruction will be
      /// inserted. (See MemOpQueueEntry.Position)
      unsigned InsertPos;
      /// Whether the instructions can be merged into a ldm/stm instruction.
      bool CanMergeToLSMulti;
      /// Whether the instructions can be merged into a ldrd/strd instruction.
      bool CanMergeToLSDouble;
    };
    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
    SmallVector<const MergeCandidate*,4> Candidates;
    SmallVector<MachineInstr*,4> MergeBaseCandidates;

    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator Before);
    unsigned findFreeReg(const TargetRegisterClass &RegClass);
    void UpdateBaseRegUses(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                           ARMCC::CondCodes Pred, unsigned PredReg);
    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
    void FormCandidates(const MemOpQueue &MemOps);
    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
    bool CombineMovBx(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
static bool definesCPSR(const MachineInstr *MI) {
  for (const auto &MO : MI->operands()) {
    if (!MO.isReg())
      continue;
    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
      // If the instruction has a live CPSR def, then it's not safe to fold it
      // into a load / store.
      return true;
  }

  return false;
}

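/// Return the byte offset of the memory access performed by \p MI, taking the
/// addressing-mode encoding into account: e.g. 8 for "ldr r0, [r1, #8]"
/// (LDRi12) and -8 for "ldrd r0, r1, [r2, #-8]" (LDRD, addrmode3).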
static int getMemoryOpOffset(const MachineInstr *MI) {
  unsigned Opcode = MI->getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  // Thumb1 immediate offsets are scaled by 4
  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
    return OffField * 4;

  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
    : ARM_AM::getAM5Offset(OffField) * 4;
  ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
    : ARM_AM::getAM5Op(OffField);

  if (Op == ARM_AM::sub)
    return -Offset;

  return Offset;
}

static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
  return MI.getOperand(1);
}

static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
  return MI.getOperand(0);
}

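/// Map a single load/store opcode and an addressing submode to the
/// corresponding load/store-multiple opcode, e.g. (LDRi12, ia) -> LDMIA.
/// Returns 0 when no such opcode exists, e.g. there are no non-writeback
/// VLDM/VSTM instructions in db mode.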
static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
  case ARM::tLDRspi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
  case ARM::tSTRspi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}

static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}

static bool isT1i32Load(unsigned Opc) {
  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc);
}

static bool isT1i32Store(unsigned Opc) {
  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
}

static bool isLoadSingle(unsigned Opc) {
  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
}

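/// Return the number of immediate-operand units per machine word for a Thumb1
/// load/store: word accesses (tLDRi) encode offsets in words, halfword
/// accesses (tLDRHi) in halfwords and byte accesses (tLDRBi) in bytes, so one
/// word corresponds to 1, 2 or 4 immediate units respectively.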
static unsigned getImmScale(unsigned Opc) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
    return 1;
  case ARM::tLDRHi:
  case ARM::tSTRHi:
    return 2;
  case ARM::tLDRBi:
  case ARM::tSTRBi:
    return 4;
  }
}

static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
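///
/// For example, after forming "tLDMIA_UPD r0!, {r1, r2}" the base r0 has
/// advanced by 8 bytes, so a following "ldr r3, [r0, #8]" is rewritten to
/// "ldr r3, [r0, #0]"; where an offset would become negative, a
/// "subs r0, #8" is inserted to reset the base instead.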
void
ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   DebugLoc DL, unsigned Base,
                                   unsigned WordOffset,
                                   ARMCC::CondCodes Pred, unsigned PredReg) {
  assert(isThumb1 && "Can only update base register uses for Thumb1!");
  // Start updating any instructions with immediate offsets. Insert a SUB before
  // the first non-updateable instruction (if any).
  for (; MBBI != MBB.end(); ++MBBI) {
    bool InsertSub = false;
    unsigned Opc = MBBI->getOpcode();

    if (MBBI->readsRegister(Base)) {
      int Offset;
      bool IsLoad =
        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
      bool IsStore =
        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;

      if (IsLoad || IsStore) {
        // Loads and stores with immediate offsets can be updated, but only if
        // the new offset isn't negative.
        // The MachineOperand containing the offset immediate is the last one
        // before predicates.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
        Offset = MO.getImm() - WordOffset * getImmScale(Opc);

        // If storing the base register, it needs to be reset first.
        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
          MO.setImm(Offset);
        else
          InsertSub = true;

      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
                 !definesCPSR(MBBI)) {
        // SUBS/ADDS using this register, with a dead def of the CPSR.
        // Merge it with the update; if the merged offset is too large,
        // insert a new sub instead.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        Offset = (Opc == ARM::tSUBi8) ?
          MO.getImm() + WordOffset * 4 :
          MO.getImm() - WordOffset * 4;
        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
          // Offset == 0.
          MO.setImm(Offset);
          // The base register has now been reset, so exit early.
          return;
        } else {
          InsertSub = true;
        }

      } else {
        // Can't update the instruction.
        InsertSub = true;
      }

    } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
      // Since SUBS sets the condition flags, we can't place the base reset
      // after an instruction that has a live CPSR def.
      // The base register might also contain an argument for a function call.
      InsertSub = true;
    }

    if (InsertSub) {
      // An instruction above couldn't be updated, so insert a sub.
      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
        .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
      return;
    }

    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
      // The base register got killed or redefined. Stop updating.
      return;
  }

  // End of block was reached.
  if (MBB.succ_size() > 0) {
    // FIXME: Because of a bug, live registers are sometimes missing from
    // the successor blocks' live-in sets. This means we can't trust that
    // information and *always* have to reset at the end of a block.
    // See PR21029.
    if (MBBI != MBB.end()) --MBBI;
    AddDefaultT1CC(
      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
      .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
  }
}

/// Return the first register of class \p RegClass that is not currently live.
unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
  if (!RegClassInfoValid) {
    RegClassInfo.runOnMachineFunction(*MF);
    RegClassInfoValid = true;
  }

  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
    if (!LiveRegs.contains(Reg))
      return Reg;
  return 0;
}

/// Compute live registers just before instruction \p Before (in normal schedule
/// direction). Computes backwards so multiple queries in the same block must
/// come in reverse order.
void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
    MachineBasicBlock::const_iterator Before) {
  // Initialize if we never queried in this block.
  if (!LiveRegsValid) {
    LiveRegs.init(TRI);
    LiveRegs.addLiveOuts(&MBB, true);
    LiveRegPos = MBB.end();
    LiveRegsValid = true;
  }
  // Move backward just before the "Before" position.
  while (LiveRegPos != Before) {
    --LiveRegPos;
    LiveRegs.stepBackward(*LiveRegPos);
  }
}

static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
                        unsigned Reg) {
  for (const std::pair<unsigned, bool> &R : Regs)
    if (R.first == Reg)
      return true;
  return false;
}

/// Create and insert a LDM or STM with \p Base as base register and registers
/// in \p Regs as the register operands that would be loaded / stored. Returns
/// the newly created instruction, or nullptr if nothing was merged.
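///
/// For example (ARM mode, zero starting offset):
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
/// =>
///   ldmia r0, {r1, r2}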
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
  unsigned NumRegs = Regs.size();
  assert(NumRegs > 1);

  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
  // Compute liveness information for that register to make the decision.
  bool SafeToClobberCPSR = !isThumb1 ||
    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
     MachineBasicBlock::LQR_Dead);

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback.
  // It's also not possible to merge an STR of the base register in Thumb1.
  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
    if (Opcode == ARM::tLDRi) {
      Writeback = false;
    } else if (Opcode == ARM::tSTRi) {
      return nullptr;
    }
  }

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
    Mode = ARM_AM::ib;
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
    Mode = ARM_AM::da;
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return nullptr;

    // On Thumb1, it's not worth materializing a new base register without
    // clobbering the CPSR (i.e. not using ADDS/SUBS).
    if (!SafeToClobberCPSR)
      return nullptr;

    unsigned NewBase;
    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination registers
      // as the new base. It will no longer be writeback in Thumb1.
      NewBase = Regs[NumRegs-1].first;
      Writeback = false;
    } else {
      // Find a free register that we can use as scratch register.
      moveLiveRegsBefore(MBB, InsertBefore);
      // The merged instruction does not exist yet but will use several Regs if
      // it is a Store.
      if (!isLoadSingle(Opcode))
        for (const std::pair<unsigned, bool> &R : Regs)
          LiveRegs.addReg(R.first);

      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
      if (NewBase == 0)
        return nullptr;
    }

    int BaseOpc =
      isThumb2 ? ARM::t2ADDri :
      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
      isThumb1 ? ARM::tADDi8  : ARM::ADDri;

    if (Offset < 0) {
      Offset = -Offset;
      BaseOpc =
        isThumb2 ? ARM::t2SUBri :
        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
    }

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return nullptr; // Probably not worth it then.

    // We can only append a kill flag to the add/sub input if the value is not
    // used in the register list of the stm as well.
    bool KillOldBase = BaseKill &&
      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

    if (isThumb1) {
      // Thumb1: depending on immediate size, use either
      //   ADDS NewBase, Base, #imm3
      // or
      //   MOV  NewBase, Base
      //   ADDS NewBase, #imm8.
      if (Base != NewBase &&
          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
        // Need to insert a MOV to the new base first.
        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
            !STI->hasV6Ops()) {
          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
          if (Pred != ARMCC::AL)
            return nullptr;
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase));
        } else
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase))
            .addImm(Pred).addReg(PredReg);

        // The following ADDS/SUBS becomes an update.
        Base = NewBase;
        KillOldBase = true;
      }
      if (BaseOpc == ARM::tADDrSPi) {
        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
          .addImm(Pred).addReg(PredReg);
      } else
        AddDefaultT1CC(
          BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
          .addImm(Pred).addReg(PredReg);
    } else {
      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
        .addImm(Pred).addReg(PredReg).addReg(0);
    }
    Base = NewBase;
    BaseKill = true; // New base is always killed straight away.
  }

  bool isDef = isLoadSingle(Opcode);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode)
    return nullptr;

  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
  // - There is no writeback (LDM of base register),
  // - the base register is killed by the merged instruction,
  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
  //   to reset the base register.
  // Otherwise, don't merge.
  // It's safe to return here since the code to materialize a new base register
  // above is also conditional on SafeToClobberCPSR.
  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
    return nullptr;

  MachineInstrBuilder MIB;

  if (Writeback) {
    assert(isThumb1 && "expected Writeback only in Thumb1");
    if (Opcode == ARM::tLDMIA) {
      assert(!ContainsReg(Regs, Base) &&
             "Thumb1 can't LDM ! with Base in Regs");
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;
    }

    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));

    // The base isn't dead after a merged instruction with writeback.
    // Insert a sub instruction after the newly formed instruction to reset it.
    if (!BaseKill)
      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);

  } else {
    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));
  }

  MIB.addImm(Pred).addReg(PredReg);

  for (const std::pair<unsigned, bool> &R : Regs)
    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

  return MIB.getInstr();
}

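/// Create and insert a t2LDRDi8/t2STRDi8 with \p Base as base register,
/// covering the register pair in \p Regs, e.g.
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
/// =>
///   ldrd r1, r2, [r0]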
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
  bool IsLoad = isi32Load(Opcode);
  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;

  assert(Regs.size() == 2);
  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
                                    TII->get(LoadStoreOpcode));
  if (IsLoad) {
    MIB.addReg(Regs[0].first, RegState::Define)
       .addReg(Regs[1].first, RegState::Define);
  } else {
    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
  }
  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  return MIB.getInstr();
}

/// Try to merge the instructions of a MergeCandidate into a single LDM/STM or
/// LDRD/STRD and erase the originals. Returns the merged instruction on
/// success, null otherwise.
MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
  const MachineInstr *First = Cand.Instrs.front();
  unsigned Opcode = First->getOpcode();
  bool IsLoad = isLoadSingle(Opcode);
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 4> ImpDefs;
  DenseSet<unsigned> KilledRegs;
  DenseSet<unsigned> UsedRegs;
  // Determine list of registers and list of implicit super-register defs.
  for (const MachineInstr *MI : Cand.Instrs) {
    const MachineOperand &MO = getLoadStoreRegOp(*MI);
    unsigned Reg = MO.getReg();
    bool IsKill = MO.isKill();
    if (IsKill)
      KilledRegs.insert(Reg);
    Regs.push_back(std::make_pair(Reg, IsKill));
    UsedRegs.insert(Reg);

    if (IsLoad) {
      // Collect any implicit defs of super-registers. After merging we can't
      // be sure anymore that we properly preserved these live ranges, so the
      // implicit operands must be removed.
      for (const MachineOperand &MO : MI->implicit_operands()) {
        if (!MO.isReg() || !MO.isDef() || MO.isDead())
          continue;
        assert(MO.isImplicit());
        unsigned DefReg = MO.getReg();

        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
          continue;
        // We can ignore cases where the super-reg is read and written.
        if (MI->readsRegister(DefReg))
          continue;
        ImpDefs.push_back(DefReg);
      }
    }
  }

  // Attempt the merge.
  typedef MachineBasicBlock::iterator iterator;
  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
  iterator InsertBefore = std::next(iterator(LatestMI));
  MachineBasicBlock &MBB = *LatestMI->getParent();
  unsigned Offset = getMemoryOpOffset(First);
  unsigned Base = getLoadStoreBaseOp(*First).getReg();
  bool BaseKill = LatestMI->killsRegister(Base);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
  DebugLoc DL = First->getDebugLoc();
  MachineInstr *Merged = nullptr;
  if (Cand.CanMergeToLSDouble)
    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
                                   Opcode, Pred, PredReg, DL, Regs);
  if (!Merged && Cand.CanMergeToLSMulti)
    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
                                  Opcode, Pred, PredReg, DL, Regs);
  if (!Merged)
    return nullptr;

  // Determine the earliest instruction that will get removed. We then keep an
  // iterator just above it so the following erases don't invalidate it.
  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
  bool EarliestAtBegin = false;
  if (EarliestI == MBB.begin()) {
    EarliestAtBegin = true;
  } else {
    EarliestI = std::prev(EarliestI);
  }

  // Remove instructions which have been merged.
  for (MachineInstr *MI : Cand.Instrs)
    MBB.erase(MI);

  // Determine range between the earliest removed instruction and the new one.
  if (EarliestAtBegin)
    EarliestI = MBB.begin();
  else
    EarliestI = std::next(EarliestI);
  auto FixupRange = make_range(EarliestI, iterator(Merged));

  if (isLoadSingle(Opcode)) {
    // If the previous loads defined a super-reg, then we have to mark earlier
    // operands undef and replicate the super-reg def on the merged
    // instruction.
    for (MachineInstr &MI : FixupRange) {
      for (unsigned &ImpDefReg : ImpDefs) {
        for (MachineOperand &MO : MI.implicit_operands()) {
          if (!MO.isReg() || MO.getReg() != ImpDefReg)
            continue;
          if (MO.readsReg())
            MO.setIsUndef();
          else if (MO.isDef())
            ImpDefReg = 0;
        }
      }
    }

    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
    for (unsigned ImpDef : ImpDefs)
      MIB.addReg(ImpDef, RegState::ImplicitDefine);
  } else {
    // Remove kill flags: the merged store may read these values later than
    // the original instructions did.
    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
    for (MachineInstr &MI : FixupRange) {
      for (MachineOperand &MO : MI.uses()) {
        if (!MO.isReg() || !MO.isKill())
          continue;
        if (UsedRegs.count(MO.getReg()))
          MO.setIsKill(false);
      }
    }
    assert(ImpDefs.empty());
  }

  return Merged;
}

static bool isValidLSDoubleOffset(int Offset) {
  unsigned Value = abs(Offset);
  // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
  // multiplied by 4.
  return (Value % 4) == 0 && Value < 1024;
}

/// Return true for loads/stores that can be combined to a double/multi
/// operation without increasing the requirements for alignment.
static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
                                 const MachineInstr &MI) {
  // vldr/vstr trap on misaligned pointers anyway; forming vldm makes no
  // difference.
  unsigned Opcode = MI.getOpcode();
  if (!isi32Load(Opcode) && !isi32Store(Opcode))
    return true;

  // Stack pointer alignment is out of the programmer's control, so we can
  // trust SP-relative loads/stores.
  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
    return true;
  return false;
}

/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
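/// For example, consecutive word loads such as "ldr r1, [r0]",
/// "ldr r2, [r0, #4]" and "ldr r3, [r0, #8]" form one candidate that is
/// eligible for an LDM merge; on Thumb2 the first two alone would also
/// qualify for an LDRD merge.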
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
  const MachineInstr *FirstMI = MemOps[0].MI;
  unsigned Opcode = FirstMI->getOpcode();
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  unsigned Size = getLSMultipleTransferSize(FirstMI);

  unsigned SIndex = 0;
  unsigned EIndex = MemOps.size();
  do {
    // Look at the first instruction.
    const MachineInstr *MI = MemOps[SIndex].MI;
    int Offset = MemOps[SIndex].Offset;
    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
    unsigned PReg = PMO.getReg();
    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
    unsigned Latest = SIndex;
    unsigned Earliest = SIndex;
    unsigned Count = 1;
    bool CanMergeToLSDouble =
      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    if (STI->isCortexM3() && isi32Load(Opcode) &&
        PReg == getLoadStoreBaseOp(*MI).getReg())
      CanMergeToLSDouble = false;

    bool CanMergeToLSMulti = true;
    // On Swift, avoid vldm/vstm starting with an odd register number, as that
    // needs more uops than single vldrs.
    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
      CanMergeToLSMulti = false;

    // LDRD/STRD do not allow SP/PC. For LDM/STM this is either not supported
    // or deprecated; LDM to PC is fine but cannot happen here.
    if (PReg == ARM::SP || PReg == ARM::PC)
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Should we be conservative?
    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Merge following instructions where possible.
    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
      int NewOffset = MemOps[I].Offset;
      if (NewOffset != Offset + (int)Size)
        break;
      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
      unsigned Reg = MO.getReg();
      if (Reg == ARM::SP || Reg == ARM::PC)
        break;

      // See if the current load/store may be part of a multi load/store.
      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
      bool PartOfLSMulti = CanMergeToLSMulti;
      if (PartOfLSMulti) {
        // Register numbers must be in ascending order.
        if (RegNum <= PRegNum)
          PartOfLSMulti = false;
        // For VFP / NEON load/store multiples, the registers must be
        // consecutive and within the limit on the number of registers per
        // instruction.
        else if (!isNotVFP && RegNum != PRegNum+1)
          PartOfLSMulti = false;
      }
      // See if the current load/store may be part of a double load/store.
      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;

      if (!PartOfLSMulti && !PartOfLSDouble)
        break;
      CanMergeToLSMulti &= PartOfLSMulti;
      CanMergeToLSDouble &= PartOfLSDouble;
      // Track MemOp with latest and earliest position (Positions are
      // counted in reverse).
      unsigned Position = MemOps[I].Position;
      if (Position < MemOps[Latest].Position)
        Latest = I;
      else if (Position > MemOps[Earliest].Position)
        Earliest = I;
      // Prepare for next MemOp.
      Offset += Size;
      PRegNum = RegNum;
    }

    // Form a candidate from the Ops collected so far.
    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
      Candidate->Instrs.push_back(MemOps[C].MI);
    Candidate->LatestMIIdx = Latest - SIndex;
    Candidate->EarliestMIIdx = Earliest - SIndex;
    Candidate->InsertPos = MemOps[Latest].Position;
    if (Count == 1)
      CanMergeToLSMulti = CanMergeToLSDouble = false;
    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
    Candidates.push_back(Candidate);
    // Continue after the chain.
    SIndex += Count;
  } while (SIndex < EIndex);
}

static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// Check if the given instruction increments or decrements a register and
/// return the amount it is incremented/decremented. Returns 0 if the
/// instruction is not a simple increment/decrement of \p Reg, or if the CPSR
/// flags it generates are possibly read as well.
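///
/// For example, with \p Reg = r0 and matching predicate state,
/// "add r0, r0, #8" (ADDri) yields 8 and "sub r0, r0, #8" (SUBri) yields -8.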
static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
                                  ARMCC::CondCodes Pred, unsigned PredReg) {
  bool CheckCPSRDef;
  int Scale;
  switch (MI.getOpcode()) {
  case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
  case ARM::t2SUBri:
  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
  case ARM::t2ADDri:
  case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
  case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
  default: return 0;
  }

  unsigned MIPredReg;
  if (MI.getOperand(0).getReg() != Reg ||
      MI.getOperand(1).getReg() != Reg ||
      getInstrPredicate(MI, MIPredReg) != Pred ||
      MIPredReg != PredReg)
    return 0;

  if (CheckCPSRDef && definesCPSR(&MI))
    return 0;
  return MI.getOperand(2).getImm() * Scale;
}

/// Searches for an increment or decrement of \p Reg before \p MBBI.
static MachineBasicBlock::iterator
findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
  Offset = 0;
  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (MBBI == BeginMBBI)
    return EndMBBI;

  // Skip debug values.
  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
  while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
    --PrevMBBI;

  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
  return Offset == 0 ? EndMBBI : PrevMBBI;
}

/// Searches for an increment or decrement of \p Reg after \p MBBI.
static MachineBasicBlock::iterator
findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
  Offset = 0;
  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
  // Skip debug values.
  while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
    ++NextMBBI;
  if (NextMBBI == EndMBBI)
    return EndMBBI;

  Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
  return Offset == 0 ? EndMBBI : NextMBBI;
}

/// Fold preceding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;

  const MachineOperand &BaseOP = MI->getOperand(0);
  unsigned Base = BaseOP.getReg();
  bool BaseKill = BaseOP.isKill();
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
  if (Mode == ARM_AM::ia && Offset == -Bytes) {
    Mode = ARM_AM::db;
  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
    Mode = ARM_AM::da;
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
      return false;
  }
  MBB.erase(MergeInstr);

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE_IMM;
  case ARM::STRi12:
    return ARM::STR_PRE_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST_IMM;
  case ARM::STRi12:
    return ARM::STR_POST_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

/// Fold preceding/trailing inc/dec of base register into the
/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
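///
/// ldr ra, [rn]
/// rn := rn + 4;
/// =>
/// ldr ra, [rn], #4
///
/// rn := rn - 4;
/// ldr ra, [rn]
/// =>
/// ldr ra, [rn, #-4]!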
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
  // Thumb1 doesn't have updating LDR/STR.
  // FIXME: Use LDM/STM with single register instead.
  if (isThumb1) return false;

  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  unsigned NewOpc;
  if (!isAM5 && Offset == Bytes) {
    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
  } else if (Offset == -Bytes) {
    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (Offset == Bytes) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
    } else if (!isAM5 && Offset == -Bytes) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
    } else
      return false;
  }
  MBB.erase(MergeInstr);

  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;

  bool isLd = isLoadSingle(Opcode);
  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
      }
    } else {
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigial zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
    } else {
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  MBB.erase(MBBI);

  return true;
}

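/// Fold a preceding/trailing inc/dec of the base register into a t2LDRDi8 or
/// t2STRDi8 to form the pre/post-indexed variant, e.g.
///
/// ldrd ra, rb, [rn]
/// rn := rn + 8;
/// =>
/// ldrd ra, rb, [rn], #8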
bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();
  assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
         "Must have t2STRDi8 or t2LDRDi8");
  if (MI.getOperand(3).getImm() != 0)
    return false;

  // Behaviour for writeback is undefined if the base register is the same as
  // one of the data registers.
  const MachineOperand &BaseOp = MI.getOperand(2);
  unsigned Base = BaseOp.getReg();
  const MachineOperand &Reg0Op = MI.getOperand(0);
  const MachineOperand &Reg1Op = MI.getOperand(1);
  if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
    return false;

  unsigned PredReg;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  MachineBasicBlock::iterator MBBI(MI);
  MachineBasicBlock &MBB = *MI.getParent();
  int Offset;
  MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
                                                            PredReg, Offset);
  unsigned NewOpc;
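  // Only fold a base increment / decrement of +/-8: the combined size of the
  // two transferred registers.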
  if (Offset == 8 || Offset == -8) {
    NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (Offset == 8 || Offset == -8) {
      NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
    } else
      return false;
  }
  MBB.erase(MergeInstr);

  DebugLoc DL = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
    MIB.addOperand(Reg0Op).addOperand(Reg1Op)
       .addReg(BaseOp.getReg(), RegState::Define);
  } else {
    assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
    MIB.addReg(BaseOp.getReg(), RegState::Define)
       .addOperand(Reg0Op).addOperand(Reg1Op);
  }
  MIB.addReg(BaseOp.getReg(), RegState::Kill)
     .addImm(Offset).addImm(Pred).addReg(PredReg);
  assert(TII->get(Opcode).getNumOperands() == 6 &&
         TII->get(NewOpc).getNumOperands() == 7 &&
         "Unexpected number of operands in Opcode specification.");

  // Transfer implicit operands.
  for (const MachineOperand &MO : MI.implicit_operands())
    MIB.addOperand(MO);
  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  MBB.erase(MBBI);
  return true;
}

/// Returns true if the instruction is a memory operation that this pass is
/// capable of operating on.
static bool isMemoryOp(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case ARM::VLDRS:
  case ARM::VSTRS:
  case ARM::VLDRD:
  case ARM::VSTRD:
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    break;
  default:
    return false;
  }
  if (!MI.getOperand(1).isReg())
    return false;

  // When no memory operands are present, conservatively assume unaligned,
  // volatile, unfoldable.
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand &MMO = **MI.memoperands_begin();

  // Don't touch volatile memory accesses - we may be changing their order.
  if (MMO.isVolatile())
    return false;

  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
  // not.
  if (MMO.getAlignment() < 4)
    return false;

  // str <undef> could probably be eliminated entirely, but for now we just
  // want to avoid making a mess of it.
  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
  if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
    return false;

  // Likewise don't mess with references to undefined addresses.
  if (MI.getOperand(1).isUndef())
    return false;

  return true;
}

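/// Helper for FixInvalidRegPairOp: emit a single LDR or STR (opcode \p NewOpc)
/// with the given offset, predicate, and register kill/undef state.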
static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int Offset, bool isDef,
                          DebugLoc DL, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

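/// Fix up an LDRD/STRD whose register pair cannot be encoded: either the
/// registers are not an (even, odd) consecutive pair as ARM-mode LDRD/STRD
/// requires, or an LDRD loads its own base register on Cortex-M3, which is
/// affected by errata 602117. Depending on the register numbers and the
/// offset, the instruction is rewritten as an LDM/STM, e.g.
///   ldrd r1, r4, [r0]  =>  ldm r0, {r1, r4}
/// or split into two single loads / stores, e.g.
///   ldrd r3, r2, [r0]  =>  ldr r3, [r0]
///                          ldr r2, [r0, #4]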
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
    return false;

  const MachineOperand &BaseOp = MI->getOperand(2);
  unsigned BaseReg = BaseOp.getReg();
  unsigned EvenReg = MI->getOperand(0).getReg();
  unsigned OddReg  = MI->getOperand(1).getReg();
  unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
  unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);

  // ARM errata 602117: LDRD with base in list may result in incorrect base
  // register when interrupted or faulted.
  bool Errata602117 = EvenReg == BaseReg &&
    (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
  // ARM LDRD/STRD needs consecutive registers.
  bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
    (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);

  if (!Errata602117 && !NonConsecutiveRegs)
    return false;

  bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
  bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
  bool EvenDeadKill = isLd ?
    MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
  bool EvenUndef = MI->getOperand(0).isUndef();
  bool OddDeadKill  = isLd ?
    MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
  bool OddUndef = MI->getOperand(1).isUndef();
  bool BaseKill = BaseOp.isKill();
  bool BaseUndef = BaseOp.isUndef();
  bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
  bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
  int OffImm = getMemoryOpOffset(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  if (OddRegNum > EvenRegNum && OffImm == 0) {
    // Ascending register numbers and no offset. It's safe to change it to an
    // ldm or stm.
    unsigned NewOpc = (isLd)
      ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
      : (isT2 ? ARM::t2STMIA : ARM::STMIA);
    if (isLd) {
      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
        .addReg(BaseReg, getKillRegState(BaseKill))
        .addImm(Pred).addReg(PredReg)
        .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
        .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
      ++NumLDRD2LDM;
    } else {
      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
        .addReg(BaseReg, getKillRegState(BaseKill))
        .addImm(Pred).addReg(PredReg)
        .addReg(EvenReg,
                getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
        .addReg(OddReg,
                getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
      ++NumSTRD2STM;
    }
  } else {
    // Split into two instructions.
    unsigned NewOpc = (isLd)
      ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
      : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
    // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
    // so adjust and use t2LDRi12 here for that.
    unsigned NewOpc2 = (isLd)
      ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
      : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
    DebugLoc dl = MBBI->getDebugLoc();
    // If this is a load and the base register is killed, it may have been
    // re-defined by the load; make sure the first load does not clobber it.
    if (isLd &&
        (BaseKill || OffKill) &&
        (TRI->regsOverlap(EvenReg, BaseReg))) {
      assert(!TRI->regsOverlap(OddReg, BaseReg));
      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                    OddReg, OddDeadKill, false,
                    BaseReg, false, BaseUndef, false, OffUndef,
                    Pred, PredReg, TII, isT2);
      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                    EvenReg, EvenDeadKill, false,
                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                    Pred, PredReg, TII, isT2);
    } else {
      if (OddReg == EvenReg && EvenDeadKill) {
        // If the two source operands are the same, the kill marker is
        // probably on the first one. e.g.
        // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
        EvenDeadKill = false;
        OddDeadKill = true;
      }
      // Never kill the base register in the first instruction.
      if (EvenReg == BaseReg)
        EvenDeadKill = false;
      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                    EvenReg, EvenDeadKill, EvenUndef,
                    BaseReg, false, BaseUndef, false, OffUndef,
                    Pred, PredReg, TII, isT2);
      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                    OddReg, OddDeadKill, OddUndef,
                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                    Pred, PredReg, TII, isT2);
    }
    if (isLd)
      ++NumLDRD2LDR;
    else
      ++NumSTRD2STR;
  }

  MBBI = MBB.erase(MBBI);
  return true;
}

/// An optimization pass to turn multiple LDR / STR ops of the same base and
/// incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  unsigned CurrOpc = ~0u;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned Position = 0;
  assert(Candidates.size() == 0);
  assert(MergeBaseCandidates.size() == 0);
  LiveRegsValid = false;

  for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
       I = MBBI) {
    // The instruction in front of the iterator is the one we look at.
    MBBI = std::prev(I);
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;
    ++Position;

    if (isMemoryOp(*MBBI)) {
      unsigned Opcode = MBBI->getOpcode();
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      if (CurrBase == 0) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrPred = Pred;
        MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
        continue;
      }
      // Note: No need to match PredReg in the next if.
      if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
        // Watch out for:
        //   r4 := ldr [r0, #8]
        //   r4 := ldr [r0, #4]
        // or
        //   r0 := ldr [r0]
        // If a load overrides the base register or a register loaded by
        // another load in our chain, we cannot take this instruction.
        bool Overlap = false;
        if (isLoadSingle(Opcode)) {
          Overlap = (Base == Reg);
          if (!Overlap) {
            for (const MemOpQueueEntry &E : MemOps) {
              if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
                Overlap = true;
                break;
              }
            }
          }
        }

        if (!Overlap) {
          // Check offset and sort memory operation into the current chain.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
            continue;
          } else {
            MemOpQueue::iterator MI, ME;
            for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
              if (Offset < MI->Offset) {
                // Found a place to insert.
                break;
              }
              if (Offset == MI->Offset) {
                // Collision, abort.
                MI = ME;
                break;
              }
            }
            if (MI != MemOps.end()) {
              MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
              continue;
            }
          }
        }
      }

      // Don't advance the iterator; the op will start a new chain next.
      MBBI = I;
      --Position;
      // Fall through to look at the existing chain.
    } else if (MBBI->isDebugValue()) {
      continue;
    } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
               MBBI->getOpcode() == ARM::t2STRDi8) {
      // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD
      // instructions; remember them because we may still be able to merge
      // add/sub into them.
      MergeBaseCandidates.push_back(MBBI);
    }

    // If we are here then the chain is broken; extract candidates for a merge.
    if (MemOps.size() > 0) {
      FormCandidates(MemOps);
      // Reset for the next chain.
      CurrBase = 0;
      CurrOpc = ~0u;
      CurrPred = ARMCC::AL;
      MemOps.clear();
    }
  }
  if (MemOps.size() > 0)
    FormCandidates(MemOps);

  // Sort candidates so they get processed from the end to the beginning of
  // the basic block later; this is necessary for liveness calculation.
  auto LessThan = [](const MergeCandidate *M0, const MergeCandidate *M1) {
    return M0->InsertPos < M1->InsertPos;
  };
  std::sort(Candidates.begin(), Candidates.end(), LessThan);

  // Go through list of candidates and merge.
  bool Changed = false;
  for (const MergeCandidate *Candidate : Candidates) {
    if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
      // Merge preceding/trailing base inc/dec into the merged op.
      if (Merged) {
        Changed = true;
        unsigned Opcode = Merged->getOpcode();
        if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
          MergeBaseUpdateLSDouble(*Merged);
        else
          MergeBaseUpdateLSMultiple(Merged);
      } else {
        for (MachineInstr *MI : Candidate->Instrs) {
          if (MergeBaseUpdateLoadStore(MI))
            Changed = true;
        }
      }
    } else {
      assert(Candidate->Instrs.size() == 1);
      if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
        Changed = true;
    }
  }
  Candidates.clear();
  // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
  for (MachineInstr *MI : MergeBaseCandidates)
    MergeBaseUpdateLSDouble(*MI);
  MergeBaseCandidates.clear();

  return Changed;
}

/// If this is an exit BB, try merging the return ops ("bx lr" and
/// "mov pc, lr") into the preceding stack restore so that it directly
/// restores the value of LR into pc.
///   ldmfd sp!, {..., lr}
///   bx lr
/// or
///   ldmfd sp!, {..., lr}
///   mov pc, lr
/// =>
///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  // Thumb1 LDM doesn't allow high registers.
  if (isThumb1) return false;
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineBasicBlock::iterator PrevI = std::prev(MBBI);
    // Ignore any DBG_VALUE instructions.
    while (PrevI->isDebugValue() && PrevI != MBB.begin())
      --PrevI;
    MachineInstr *PrevMI = PrevI;
    unsigned Opcode = PrevMI->getOpcode();
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(*MBB.getParent(), *MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

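/// Fold a Thumb1 epilogue of the form
///   mov lr, rN
///   bx  lr
/// into a single "bx rN" when rN is killed by the tMOVr.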
bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  if (MBBI == MBB.begin() || MBBI == MBB.end() ||
      MBBI->getOpcode() != ARM::tBX_RET)
    return false;

  MachineBasicBlock::iterator Prev = MBBI;
  --Prev;
  if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
    return false;

  for (auto Use : Prev->uses())
    if (Use.isKill()) {
      AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
                         .addReg(Use.getReg(), RegState::Kill))
          .copyImplicitOps(*MBBI);
      MBB.erase(MBBI);
      MBB.erase(Prev);
      return true;
    }

  llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
}

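/// Entry point of the post-RA pass. For each basic block this forms LDM/STM
/// sequences, merges returns into LDMs (on ARMv5T and later), and combines
/// mov/bx pairs (on Thumb1).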
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  MF = &Fn;
  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
  TL = STI->getTargetLowering();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();

  RegClassInfoValid = false;
  isThumb2 = AFI->isThumb2Function();
  isThumb1 = AFI->isThumbFunction() && !isThumb2;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    if (STI->hasV5TOps())
      Modified |= MergeReturnIntoLDM(MBB);
    if (isThumb1)
      Modified |= CombineMovBx(MBB);
  }

  Allocator.DestroyAll();
  return Modified;
}

namespace llvm {
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
}

#define ARM_PREALLOC_LOAD_STORE_OPT_NAME                                       \
  "ARM pre- register allocation load / store optimization pass"

namespace {
  /// Pre- register allocation pass that moves loads / stores from consecutive
  /// locations close together to make it more likely they will be combined
  /// later.
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {
      initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry());
    }

    const DataLayout *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &FirstReg,
                          unsigned &SecondReg, unsigned &BaseReg,
                          int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVectorImpl<MachineInstr *> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
                ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (AssumeMisalignedLoadStores)
    return false;

  TD = &Fn.getDataLayout();
  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineBasicBlock &MFI : Fn)
    Modified |= RescheduleLoadStoreInstrs(&MFI);

  return Modified;
}

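/// Returns true if it is safe and profitable to move the loads / stores in
/// \p MemOps next to each other: no intervening call, terminator, potentially
/// aliasing memory access, or redefinition of the base register, and only a
/// bounded increase in register pressure.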
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (I->isDebugValue() || MemOps.count(&*I))
      continue;
    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
      return false;
    if (isLd && I->mayStore())
      return false;
    if (!isLd) {
      if (I->mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (I->mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate the register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // OK if we are moving a small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}

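/// Returns true if the two word-sized loads / stores \p Op0 and \p Op1 can be
/// combined into a single LDRD / STRD. On success the new opcode, the two
/// data registers, the base register, and the encoded offset are returned
/// through the out-parameters.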
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl, unsigned &NewOpc,
                                          unsigned &FirstReg,
                                          unsigned &SecondReg,
                                          unsigned &BaseReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDRi12) {
    NewOpc = ARM::LDRD;
  } else if (Opcode == ARM::STRi12) {
    NewOpc = ARM::STRD;
  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else {
    return false;
  }

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement. At the moment, we ignore the memory operand's value.
  // If we want to use AliasAnalysis, we should check it accordingly.
  if (!Op0->hasOneMemOperand() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  const Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

2074 
2075   // Then make sure the immediate offset fits.
2076   int OffImm = getMemoryOpOffset(Op0);
2077   if (isT2) {
2078     int Limit = (1 << 8) * Scale;
2079     if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
2080       return false;
2081     Offset = OffImm;
2082   } else {
2083     ARM_AM::AddrOpc AddSub = ARM_AM::add;
2084     if (OffImm < 0) {
2085       AddSub = ARM_AM::sub;
2086       OffImm = - OffImm;
2087     }
2088     int Limit = (1 << 8) * Scale;
2089     if (OffImm >= Limit || (OffImm & (Scale-1)))
2090       return false;
2091     Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
2092   }
2093   FirstReg = Op0->getOperand(0).getReg();
2094   SecondReg = Op1->getOperand(0).getReg();
2095   if (FirstReg == SecondReg)
2096     return false;
2097   BaseReg = Op0->getOperand(1).getReg();
2098   Pred = getInstrPredicate(*Op0, PredReg);
2099   dl = Op0->getDebugLoc();
2100   return true;
2101 }
2102 
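/// Move the loads / stores in \p Ops, which all use base register \p Base,
/// next to each other so that later passes are more likely to combine them,
/// e.g.
///   ldr r1, [r0]
///   add r7, r7, #1
///   ldr r2, [r0, #4]
/// =>
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
///   add r7, r7, #1
/// When exactly two operations are moved and they qualify, they are combined
/// into an LDRD / STRD directly.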
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVectorImpl<MachineInstr *> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(),
            [](const MachineInstr *LHS, const MachineInstr *RHS) {
    int LOffset = getMemoryOpOffset(LHS);
    int ROffset = getMemoryOpOffset(RHS);
    assert(LHS == RHS || LOffset != ROffset);
    return LOffset > ROffset;
  });

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = nullptr;
    MachineInstr *LastOp = nullptr;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned LSMOpcode
        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
      if (LastOpcode && LSMOpcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = LSMOpcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative: if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate registers that can form a register pair.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned FirstReg = 0, SecondReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             FirstReg, SecondReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          const MCInstrDesc &MCID = TII->get(NewOpc);
          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
          MRI->constrainRegClass(FirstReg, TRC);
          MRI->constrainRegClass(SecondReg, TRC);

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(FirstReg, RegState::Define)
              .addReg(SecondReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(FirstReg)
              .addReg(SecondReg)
              .addReg(BaseReg);
            // FIXME: We're converting from STRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          if (!isT2) {
            // Add register allocation hints to form register pairs.
            MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
            MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
          }
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

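/// Scan the basic block for loads and stores, grouping them by base register,
/// and invoke RescheduleOps on each group. Calls and terminators act as
/// barriers that end the current region, as does seeing the same base+offset
/// combination twice.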
2263 
2264 bool
2265 ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
2266   bool RetVal = false;
2267 
2268   DenseMap<MachineInstr*, unsigned> MI2LocMap;
2269   DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
2270   DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
2271   SmallVector<unsigned, 4> LdBases;
2272   SmallVector<unsigned, 4> StBases;
2273 
2274   unsigned Loc = 0;
2275   MachineBasicBlock::iterator MBBI = MBB->begin();
2276   MachineBasicBlock::iterator E = MBB->end();
2277   while (MBBI != E) {
2278     for (; MBBI != E; ++MBBI) {
2279       MachineInstr *MI = MBBI;
2280       if (MI->isCall() || MI->isTerminator()) {
2281         // Stop at barriers.
2282         ++MBBI;
2283         break;
2284       }
2285 
2286       if (!MI->isDebugValue())
2287         MI2LocMap[MI] = ++Loc;
2288 
2289       if (!isMemoryOp(*MI))
2290         continue;
2291       unsigned PredReg = 0;
2292       if (getInstrPredicate(*MI, PredReg) != ARMCC::AL)
2293         continue;
2294 
2295       int Opc = MI->getOpcode();
2296       bool isLd = isLoadSingle(Opc);
2297       unsigned Base = MI->getOperand(1).getReg();
2298       int Offset = getMemoryOpOffset(MI);
2299 
2300       bool StopHere = false;
2301       if (isLd) {
2302         DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
2303           Base2LdsMap.find(Base);
2304         if (BI != Base2LdsMap.end()) {
2305           for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2306             if (Offset == getMemoryOpOffset(BI->second[i])) {
2307               StopHere = true;
2308               break;
2309             }
2310           }
2311           if (!StopHere)
2312             BI->second.push_back(MI);
2313         } else {
2314           Base2LdsMap[Base].push_back(MI);
2315           LdBases.push_back(Base);
2316         }
2317       } else {
2318         DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
2319           Base2StsMap.find(Base);
2320         if (BI != Base2StsMap.end()) {
2321           for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2322             if (Offset == getMemoryOpOffset(BI->second[i])) {
2323               StopHere = true;
2324               break;
2325             }
2326           }
2327           if (!StopHere)
2328             BI->second.push_back(MI);
2329         } else {
2330           Base2StsMap[Base].push_back(MI);
2331           StBases.push_back(Base);
2332         }
2333       }
2334 
2335       if (StopHere) {
2336         // Found a duplicate (a base+offset combination that's seen earlier).
2337         // Backtrack.
2338         --Loc;
2339         break;
2340       }
2341     }
2342 
2343     // Re-schedule loads.
2344     for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
2345       unsigned Base = LdBases[i];
2346       SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
2347       if (Lds.size() > 1)
2348         RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
2349     }
2350 
2351     // Re-schedule stores.
2352     for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
2353       unsigned Base = StBases[i];
2354       SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
2355       if (Sts.size() > 1)
2356         RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
2357     }
2358 
2359     if (MBBI != E) {
2360       Base2LdsMap.clear();
2361       Base2StsMap.clear();
2362       LdBases.clear();
2363       StBases.clear();
2364     }
2365   }
2366 
2367   return RetVal;
2368 }

/// Returns an instance of the load / store optimization pass: the
/// pre-register-allocation pass if \p PreAlloc is set, otherwise the post-RA
/// pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}