1 //===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file This file contains a pass that performs load / store related peephole
11 /// optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ARM.h"
16 #include "ARMBaseInstrInfo.h"
17 #include "ARMBaseRegisterInfo.h"
18 #include "ARMISelLowering.h"
19 #include "ARMMachineFunctionInfo.h"
20 #include "ARMSubtarget.h"
21 #include "MCTargetDesc/ARMAddressingModes.h"
22 #include "ThumbRegisterInfo.h"
23 #include "llvm/ADT/DenseMap.h"
24 #include "llvm/ADT/STLExtras.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineFunctionPass.h"
31 #include "llvm/CodeGen/MachineInstr.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineRegisterInfo.h"
34 #include "llvm/CodeGen/RegisterClassInfo.h"
35 #include "llvm/CodeGen/SelectionDAGNodes.h"
36 #include "llvm/CodeGen/LivePhysRegs.h"
37 #include "llvm/IR/DataLayout.h"
38 #include "llvm/IR/DerivedTypes.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/Support/Allocator.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/raw_ostream.h"
44 #include "llvm/Target/TargetInstrInfo.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetRegisterInfo.h"
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "arm-ldst-opt"
50 
51 STATISTIC(NumLDMGened , "Number of ldm instructions generated");
52 STATISTIC(NumSTMGened , "Number of stm instructions generated");
53 STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
54 STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
55 STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
56 STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
57 STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
58 STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
59 STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
60 STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
61 STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
62 
63 /// This switch disables formation of double/multi instructions that could
64 /// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
65 /// disabled. This can be used to create libraries that are robust even when
66 /// users provoke undefined behaviour by supplying misaligned pointers.
67 /// \see mayCombineMisaligned()
68 static cl::opt<bool>
69 AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
70     cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
71 
72 namespace llvm {
73 void initializeARMLoadStoreOptPass(PassRegistry &);
74 }
75 
76 #define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
77 
namespace {
  /// Post-register-allocation pass that combines load / store instructions
  /// to form ldm / stm instructions.
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {
      initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
    }

    // Per-function state cached for the duration of a run.
    const MachineFunction *MF;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    // Lazily computed liveness state; only meaningful while LiveRegsValid
    // is true. See moveLiveRegsBefore() and findFreeReg().
    LivePhysRegs LiveRegs;
    RegisterClassInfo RegClassInfo;
    // Position up to which LiveRegs has been stepped backwards.
    MachineBasicBlock::const_iterator LiveRegPos;
    bool LiveRegsValid;
    bool RegClassInfoValid;
    // Thumb mode flags for the current function (presumably initialized in
    // runOnMachineFunction, which is defined outside this chunk).
    bool isThumb1, isThumb2;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    // This pass runs post-RA, so all virtual registers must be allocated.
    MachineFunctionProperties getRequiredProperties() const override {
      return MachineFunctionProperties().set(
          MachineFunctionProperties::Property::AllVRegsAllocated);
    }

    const char *getPassName() const override {
      return ARM_LOAD_STORE_OPT_NAME;
    }

  private:
    /// A set of load/store MachineInstrs with same base register sorted by
    /// offset.
    struct MemOpQueueEntry {
      MachineInstr *MI;
      int Offset;        ///< Load/Store offset.
      unsigned Position; ///< Position as counted from end of basic block.
      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
        : MI(MI), Offset(Offset), Position(Position) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;

    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
    /// merged into a LDM/STM.
    struct MergeCandidate {
      /// List of instructions ordered by load/store offset.
      SmallVector<MachineInstr*, 4> Instrs;
      /// Index in Instrs of the instruction being latest in the schedule.
      unsigned LatestMIIdx;
      /// Index in Instrs of the instruction being earliest in the schedule.
      unsigned EarliestMIIdx;
      /// Index into the basic block where the merged instruction will be
      /// inserted. (See MemOpQueueEntry.Position)
      unsigned InsertPos;
      /// Whether the instructions can be merged into a ldm/stm instruction.
      bool CanMergeToLSMulti;
      /// Whether the instructions can be merged into a ldrd/strd instruction.
      bool CanMergeToLSDouble;
    };
    // Arena for MergeCandidates; freed wholesale rather than per-candidate.
    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
    SmallVector<const MergeCandidate*,4> Candidates;
    SmallVector<MachineInstr*,4> MergeBaseCandidates;

    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator Before);
    unsigned findFreeReg(const TargetRegisterClass &RegClass);
    void UpdateBaseRegUses(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                           ARMCC::CondCodes Pred, unsigned PredReg);
    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
    void FormCandidates(const MemOpQueue &MemOps);
    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
    bool CombineMovBx(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}
172 
// Register the pass with the pass registry so it can be referenced by its
// command-line name (arm-load-store-opt).
INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
174 
175 static bool definesCPSR(const MachineInstr *MI) {
176   for (const auto &MO : MI->operands()) {
177     if (!MO.isReg())
178       continue;
179     if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
180       // If the instruction has live CPSR def, then it's not safe to fold it
181       // into load / store.
182       return true;
183   }
184 
185   return false;
186 }
187 
188 static int getMemoryOpOffset(const MachineInstr *MI) {
189   unsigned Opcode = MI->getOpcode();
190   bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
191   unsigned NumOperands = MI->getDesc().getNumOperands();
192   unsigned OffField = MI->getOperand(NumOperands-3).getImm();
193 
194   if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
195       Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
196       Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
197       Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
198     return OffField;
199 
200   // Thumb1 immediate offsets are scaled by 4
201   if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
202       Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
203     return OffField * 4;
204 
205   int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
206     : ARM_AM::getAM5Offset(OffField) * 4;
207   ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
208     : ARM_AM::getAM5Op(OffField);
209 
210   if (Op == ARM_AM::sub)
211     return -Offset;
212 
213   return Offset;
214 }
215 
216 static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
217   return MI.getOperand(1);
218 }
219 
220 static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
221   return MI.getOperand(0);
222 }
223 
/// Map a single load/store opcode plus addressing submode \p Mode to the
/// corresponding load/store-multiple opcode. Returns 0 when no such
/// instruction exists (the VFP DB submodes only exist as writeback forms).
/// Note: bumps the NumLDM/NumSTM/NumVLDM/NumVSTM statistics as a side effect.
static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
  case ARM::tLDRspi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
  case ARM::tSTRspi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}
308 
/// Return the addressing submode (increment/decrement, before/after) encoded
/// in a load/store-multiple opcode.
static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}
362 
363 static bool isT1i32Load(unsigned Opc) {
364   return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
365 }
366 
367 static bool isT2i32Load(unsigned Opc) {
368   return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
369 }
370 
371 static bool isi32Load(unsigned Opc) {
372   return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ;
373 }
374 
375 static bool isT1i32Store(unsigned Opc) {
376   return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
377 }
378 
379 static bool isT2i32Store(unsigned Opc) {
380   return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
381 }
382 
383 static bool isi32Store(unsigned Opc) {
384   return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
385 }
386 
387 static bool isLoadSingle(unsigned Opc) {
388   return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
389 }
390 
391 static unsigned getImmScale(unsigned Opc) {
392   switch (Opc) {
393   default: llvm_unreachable("Unhandled opcode!");
394   case ARM::tLDRi:
395   case ARM::tSTRi:
396   case ARM::tLDRspi:
397   case ARM::tSTRspi:
398     return 1;
399   case ARM::tLDRHi:
400   case ARM::tSTRHi:
401     return 2;
402   case ARM::tLDRBi:
403   case ARM::tSTRBi:
404     return 4;
405   }
406 }
407 
/// Return the number of bytes transferred by memory instruction \p MI, or 0
/// for opcodes this pass does not recognize. For the multiple (LDM/STM/
/// VLDM/VSTM) forms the size is derived from the length of the variadic
/// register list (total operands minus the fixed operands, plus one for the
/// first list element that is counted among the fixed operands).
static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}
450 
/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
///
/// After a merged instruction with writeback, \p Base is larger by
/// \p WordOffset words (WordOffset * 4 bytes) than downstream code expects.
/// Walk forward from \p MBBI adjusting immediate offsets of readers where
/// possible; where not possible, insert a compensating SUBS and stop.
void
ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   DebugLoc DL, unsigned Base,
                                   unsigned WordOffset,
                                   ARMCC::CondCodes Pred, unsigned PredReg) {
  assert(isThumb1 && "Can only update base register uses for Thumb1!");
  // Start updating any instructions with immediate offsets. Insert a SUB before
  // the first non-updateable instruction (if any).
  for (; MBBI != MBB.end(); ++MBBI) {
    bool InsertSub = false;
    unsigned Opc = MBBI->getOpcode();

    if (MBBI->readsRegister(Base)) {
      int Offset;
      bool IsLoad =
        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
      bool IsStore =
        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;

      if (IsLoad || IsStore) {
        // Loads and stores with immediate offsets can be updated, but only if
        // the new offset isn't negative.
        // The MachineOperand containing the offset immediate is the last one
        // before predicates.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
        Offset = MO.getImm() - WordOffset * getImmScale(Opc);

        // If storing the base register, it needs to be reset first.
        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
          MO.setImm(Offset);
        else
          InsertSub = true;

      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
                 !definesCPSR(MBBI)) {
        // SUBS/ADDS using this register, with a dead def of the CPSR.
        // Merge it with the update; if the merged offset is too large,
        // insert a new sub instead.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        Offset = (Opc == ARM::tSUBi8) ?
          MO.getImm() + WordOffset * 4 :
          MO.getImm() - WordOffset * 4 ;
        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
          // Offset == 0.
          MO.setImm(Offset);
          // The base register has now been reset, so exit early.
          return;
        } else {
          InsertSub = true;
        }

      } else {
        // Can't update the instruction.
        InsertSub = true;
      }

    } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
      // Since SUBS sets the condition flags, we can't place the base reset
      // after an instruction that has a live CPSR def.
      // The base register might also contain an argument for a function call.
      InsertSub = true;
    }

    if (InsertSub) {
      // An instruction above couldn't be updated, so insert a sub.
      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
        .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
      return;
    }

    // Stop once the base register is killed or redefined; later code no
    // longer sees the stale value.
    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
      // Register got killed. Stop updating.
      return;
  }

  // End of block was reached.
  if (MBB.succ_size() > 0) {
    // FIXME: Because of a bug, live registers are sometimes missing from
    // the successor blocks' live-in sets. This means we can't trust that
    // information and *always* have to reset at the end of a block.
    // See PR21029.
    if (MBBI != MBB.end()) --MBBI;
    AddDefaultT1CC(
      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
      .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
  }
}
547 
548 /// Return the first register of class \p RegClass that is not in \p Regs.
549 unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
550   if (!RegClassInfoValid) {
551     RegClassInfo.runOnMachineFunction(*MF);
552     RegClassInfoValid = true;
553   }
554 
555   for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
556     if (!LiveRegs.contains(Reg))
557       return Reg;
558   return 0;
559 }
560 
/// Compute live registers just before instruction \p Before (in normal schedule
/// direction). Computes backwards so multiple queries in the same block must
/// come in reverse order.
///
/// NOTE(review): LiveRegsValid is presumably reset when processing moves to a
/// new block (not visible in this chunk) -- confirm before relying on
/// cross-block queries.
void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
    MachineBasicBlock::const_iterator Before) {
  // Initialize if we never queried in this block.
  if (!LiveRegsValid) {
    LiveRegs.init(TRI);
    LiveRegs.addLiveOuts(MBB);
    LiveRegPos = MBB.end();
    LiveRegsValid = true;
  }
  // Move backward just before the "Before" position, updating liveness one
  // instruction at a time.
  while (LiveRegPos != Before) {
    --LiveRegPos;
    LiveRegs.stepBackward(*LiveRegPos);
  }
}
579 
580 static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
581                         unsigned Reg) {
582   for (const std::pair<unsigned, bool> &R : Regs)
583     if (R.first == Reg)
584       return true;
585   return false;
586 }
587 
/// Create and insert a LDM or STM with Base as base register and registers in
/// Regs as the register operands that would be loaded / stored.  Returns the
/// newly created instruction, or nullptr when merging is not possible or not
/// profitable.  (The old comment claimed a bool return; the function returns
/// the merged MachineInstr.)
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
  unsigned NumRegs = Regs.size();
  assert(NumRegs > 1);

  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
  // Compute liveness information for that register to make the decision.
  bool SafeToClobberCPSR = !isThumb1 ||
    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
     MachineBasicBlock::LQR_Dead);

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback.
  // It's also not possible to merge an STR of the base register in Thumb1.
  // NOTE(review): the tSTRi arm below is unreachable -- isi32Load(Opcode)
  // already excludes store opcodes; confirm intent upstream.
  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
    if (Opcode == ARM::tLDRi) {
      Writeback = false;
    } else if (Opcode == ARM::tSTRi) {
      return nullptr;
    }
  }

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
    Mode = ARM_AM::ib;
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
    Mode = ARM_AM::da;
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return nullptr;

    // On Thumb1, it's not worth materializing a new base register without
    // clobbering the CPSR (i.e. not using ADDS/SUBS).
    if (!SafeToClobberCPSR)
      return nullptr;

    unsigned NewBase;
    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination registers
      // as the new base. Will no longer be writeback in Thumb1.
      NewBase = Regs[NumRegs-1].first;
      Writeback = false;
    } else {
      // Find a free register that we can use as scratch register.
      moveLiveRegsBefore(MBB, InsertBefore);
      // The merged instruction does not exist yet but will use several Regs if
      // it is a Store.
      if (!isLoadSingle(Opcode))
        for (const std::pair<unsigned, bool> &R : Regs)
          LiveRegs.addReg(R.first);

      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
      if (NewBase == 0)
        return nullptr;
    }

    // Pick an add/sub opcode able to compute NewBase = Base +/- Offset for
    // the current ISA (and, for Thumb1, the immediate range / SP rules).
    int BaseOpc =
      isThumb2 ? ARM::t2ADDri :
      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
      isThumb1 ? ARM::tADDi8  : ARM::ADDri;

    if (Offset < 0) {
      Offset = - Offset;
      BaseOpc =
        isThumb2 ? ARM::t2SUBri :
        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
    }

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return nullptr; // Probably not worth it then.

    // We can only append a kill flag to the add/sub input if the value is not
    // used in the register list of the stm as well.
    bool KillOldBase = BaseKill &&
      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

    if (isThumb1) {
      // Thumb1: depending on immediate size, use either
      //   ADDS NewBase, Base, #imm3
      // or
      //   MOV  NewBase, Base
      //   ADDS NewBase, #imm8.
      if (Base != NewBase &&
          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
        // Need to insert a MOV to the new base first.
        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
            !STI->hasV6Ops()) {
          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
          if (Pred != ARMCC::AL)
            return nullptr;
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase));
        } else
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase))
            .addImm(Pred).addReg(PredReg);

        // The following ADDS/SUBS becomes an update.
        Base = NewBase;
        KillOldBase = true;
      }
      if (BaseOpc == ARM::tADDrSPi) {
        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
          .addImm(Pred).addReg(PredReg);
      } else
        AddDefaultT1CC(
          BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
          .addImm(Pred).addReg(PredReg);
    } else {
      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
        .addImm(Pred).addReg(PredReg).addReg(0);
    }
    Base = NewBase;
    BaseKill = true; // New base is always killed straight away.
  }

  bool isDef = isLoadSingle(Opcode);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode)
    return nullptr;

  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
  // - There is no writeback (LDM of base register),
  // - the base register is killed by the merged instruction,
  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
  //   to reset the base register.
  // Otherwise, don't merge.
  // It's safe to return here since the code to materialize a new base register
  // above is also conditional on SafeToClobberCPSR.
  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
    return nullptr;

  MachineInstrBuilder MIB;

  if (Writeback) {
    assert(isThumb1 && "expected Writeback only inThumb1");
    if (Opcode == ARM::tLDMIA) {
      assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs");
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;
    }

    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));

    // The base isn't dead after a merged instruction with writeback.
    // Insert a sub instruction after the newly formed instruction to reset.
    if (!BaseKill)
      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);

  } else {
    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));
  }

  MIB.addImm(Pred).addReg(PredReg);

  // Append the register list: defs for a load, uses (with kill flags) for a
  // store.
  for (const std::pair<unsigned, bool> &R : Regs)
    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

  return MIB.getInstr();
}
786 
787 MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
788     MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
789     bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
790     DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
791   bool IsLoad = isi32Load(Opcode);
792   assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
793   unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
794 
795   assert(Regs.size() == 2);
796   MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
797                                     TII->get(LoadStoreOpcode));
798   if (IsLoad) {
799     MIB.addReg(Regs[0].first, RegState::Define)
800        .addReg(Regs[1].first, RegState::Define);
801   } else {
802     MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
803        .addReg(Regs[1].first, getKillRegState(Regs[1].second));
804   }
805   MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
806   return MIB.getInstr();
807 }
808 
809 /// Call MergeOps and update MemOps and merges accordingly on success.
810 MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
811   const MachineInstr *First = Cand.Instrs.front();
812   unsigned Opcode = First->getOpcode();
813   bool IsLoad = isLoadSingle(Opcode);
814   SmallVector<std::pair<unsigned, bool>, 8> Regs;
815   SmallVector<unsigned, 4> ImpDefs;
816   DenseSet<unsigned> KilledRegs;
817   DenseSet<unsigned> UsedRegs;
818   // Determine list of registers and list of implicit super-register defs.
819   for (const MachineInstr *MI : Cand.Instrs) {
820     const MachineOperand &MO = getLoadStoreRegOp(*MI);
821     unsigned Reg = MO.getReg();
822     bool IsKill = MO.isKill();
823     if (IsKill)
824       KilledRegs.insert(Reg);
825     Regs.push_back(std::make_pair(Reg, IsKill));
826     UsedRegs.insert(Reg);
827 
828     if (IsLoad) {
829       // Collect any implicit defs of super-registers, after merging we can't
830       // be sure anymore that we properly preserved these live ranges and must
831       // removed these implicit operands.
832       for (const MachineOperand &MO : MI->implicit_operands()) {
833         if (!MO.isReg() || !MO.isDef() || MO.isDead())
834           continue;
835         assert(MO.isImplicit());
836         unsigned DefReg = MO.getReg();
837 
838         if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
839           continue;
840         // We can ignore cases where the super-reg is read and written.
841         if (MI->readsRegister(DefReg))
842           continue;
843         ImpDefs.push_back(DefReg);
844       }
845     }
846   }
847 
848   // Attempt the merge.
849   typedef MachineBasicBlock::iterator iterator;
850   MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
851   iterator InsertBefore = std::next(iterator(LatestMI));
852   MachineBasicBlock &MBB = *LatestMI->getParent();
853   unsigned Offset = getMemoryOpOffset(First);
854   unsigned Base = getLoadStoreBaseOp(*First).getReg();
855   bool BaseKill = LatestMI->killsRegister(Base);
856   unsigned PredReg = 0;
857   ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
858   DebugLoc DL = First->getDebugLoc();
859   MachineInstr *Merged = nullptr;
860   if (Cand.CanMergeToLSDouble)
861     Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
862                                    Opcode, Pred, PredReg, DL, Regs);
863   if (!Merged && Cand.CanMergeToLSMulti)
864     Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
865                                   Opcode, Pred, PredReg, DL, Regs);
866   if (!Merged)
867     return nullptr;
868 
869   // Determine earliest instruction that will get removed. We then keep an
870   // iterator just above it so the following erases don't invalidated it.
871   iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
872   bool EarliestAtBegin = false;
873   if (EarliestI == MBB.begin()) {
874     EarliestAtBegin = true;
875   } else {
876     EarliestI = std::prev(EarliestI);
877   }
878 
879   // Remove instructions which have been merged.
880   for (MachineInstr *MI : Cand.Instrs)
881     MBB.erase(MI);
882 
883   // Determine range between the earliest removed instruction and the new one.
884   if (EarliestAtBegin)
885     EarliestI = MBB.begin();
886   else
887     EarliestI = std::next(EarliestI);
888   auto FixupRange = make_range(EarliestI, iterator(Merged));
889 
890   if (isLoadSingle(Opcode)) {
891     // If the previous loads defined a super-reg, then we have to mark earlier
892     // operands undef; Replicate the super-reg def on the merged instruction.
893     for (MachineInstr &MI : FixupRange) {
894       for (unsigned &ImpDefReg : ImpDefs) {
895         for (MachineOperand &MO : MI.implicit_operands()) {
896           if (!MO.isReg() || MO.getReg() != ImpDefReg)
897             continue;
898           if (MO.readsReg())
899             MO.setIsUndef();
900           else if (MO.isDef())
901             ImpDefReg = 0;
902         }
903       }
904     }
905 
906     MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
907     for (unsigned ImpDef : ImpDefs)
908       MIB.addReg(ImpDef, RegState::ImplicitDefine);
909   } else {
910     // Remove kill flags: We are possibly storing the values later now.
911     assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
912     for (MachineInstr &MI : FixupRange) {
913       for (MachineOperand &MO : MI.uses()) {
914         if (!MO.isReg() || !MO.isKill())
915           continue;
916         if (UsedRegs.count(MO.getReg()))
917           MO.setIsKill(false);
918       }
919     }
920     assert(ImpDefs.empty());
921   }
922 
923   return Merged;
924 }
925 
/// Return true if \p Offset is encodable as a t2LDRDi8/t2STRDi8 offset:
/// an 8-bit immediate that is internally multiplied by 4, i.e. a multiple
/// of 4 whose magnitude is below 1024.
static bool isValidLSDoubleOffset(int Offset) {
  int Magnitude = Offset < 0 ? -Offset : Offset;
  return Magnitude < 1024 && (Magnitude & 3) == 0;
}
932 
933 /// Return true for loads/stores that can be combined to a double/multi
934 /// operation without increasing the requirements for alignment.
935 static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
936                                  const MachineInstr &MI) {
937   // vldr/vstr trap on misaligned pointers anyway, forming vldm makes no
938   // difference.
939   unsigned Opcode = MI.getOpcode();
940   if (!isi32Load(Opcode) && !isi32Store(Opcode))
941     return true;
942 
943   // Stack pointer alignment is out of the programmers control so we can trust
944   // SP-relative loads/stores.
945   if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
946       STI.getFrameLowering()->getTransientStackAlignment() >= 4)
947     return true;
948   return false;
949 }
950 
/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
  const MachineInstr *FirstMI = MemOps[0].MI;
  unsigned Opcode = FirstMI->getOpcode();
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  unsigned Size = getLSMultipleTransferSize(FirstMI);

  // Partition MemOps into maximal runs of mergeable operations: SIndex is the
  // start of the run being grown; each loop iteration emits exactly one
  // MergeCandidate and advances SIndex past the run.
  unsigned SIndex = 0;
  unsigned EIndex = MemOps.size();
  do {
    // Look at the first instruction.
    const MachineInstr *MI = MemOps[SIndex].MI;
    int Offset = MemOps[SIndex].Offset;
    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
    unsigned PReg = PMO.getReg();
    // Undef sources get UINT_MAX so the ascending-register check below never
    // merges anything after them.
    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
    unsigned Latest = SIndex;
    unsigned Earliest = SIndex;
    unsigned Count = 1;
    // LDRD/STRD are only formed for Thumb2 integer ops with an offset that
    // fits the instruction's scaled 8-bit immediate.
    bool CanMergeToLSDouble =
      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    if (STI->isCortexM3() && isi32Load(Opcode) &&
        PReg == getLoadStoreBaseOp(*MI).getReg())
      CanMergeToLSDouble = false;

    bool CanMergeToLSMulti = true;
    // On swift vldm/vstm starting with an odd register number as that needs
    // more uops than single vldrs.
    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
      CanMergeToLSMulti = false;

    // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
    // deprecated; LDM to PC is fine but cannot happen here.
    if (PReg == ARM::SP || PReg == ARM::PC)
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Should we be conservative?
    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Merge following instructions where possible.
    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
      int NewOffset = MemOps[I].Offset;
      // Memory offsets must be strictly consecutive.
      if (NewOffset != Offset + (int)Size)
        break;
      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
      unsigned Reg = MO.getReg();
      if (Reg == ARM::SP || Reg == ARM::PC)
        break;

      // See if the current load/store may be part of a multi load/store.
      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
      bool PartOfLSMulti = CanMergeToLSMulti;
      if (PartOfLSMulti) {
        // Register numbers must be in ascending order.
        if (RegNum <= PRegNum)
          PartOfLSMulti = false;
        // For VFP / NEON load/store multiples, the registers must be
        // consecutive and within the limit on the number of registers per
        // instruction.
        else if (!isNotVFP && RegNum != PRegNum+1)
          PartOfLSMulti = false;
      }
      // See if the current load/store may be part of a double load/store.
      // An LDRD/STRD transfers exactly two registers, so only the second
      // MemOp in the run (Count <= 1 so far) may still join.
      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;

      if (!PartOfLSMulti && !PartOfLSDouble)
        break;
      CanMergeToLSMulti &= PartOfLSMulti;
      CanMergeToLSDouble &= PartOfLSDouble;
      // Track MemOp with latest and earliest position (Positions are
      // counted in reverse).
      unsigned Position = MemOps[I].Position;
      if (Position < MemOps[Latest].Position)
        Latest = I;
      else if (Position > MemOps[Earliest].Position)
        Earliest = I;
      // Prepare for next MemOp.
      Offset += Size;
      PRegNum = RegNum;
    }

    // Form a candidate from the Ops collected so far.
    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
      Candidate->Instrs.push_back(MemOps[C].MI);
    Candidate->LatestMIIdx = Latest - SIndex;
    Candidate->EarliestMIIdx = Earliest - SIndex;
    Candidate->InsertPos = MemOps[Latest].Position;
    // A single instruction has nothing to merge with.
    if (Count == 1)
      CanMergeToLSMulti = CanMergeToLSDouble = false;
    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
    Candidates.push_back(Candidate);
    // Continue after the chain.
    SIndex += Count;
  } while (SIndex < EIndex);
}
1051 
/// Return the base-updating (writeback) form of the given load/store-multiple
/// opcode for addressing submode \p Mode. The ARM-mode LDM/STM variants
/// support all four submodes; the Thumb2 and VFP variants only support ia/db.
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}
1118 
1119 /// Check if the given instruction increments or decrements a register and
1120 /// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
1121 /// generated by the instruction are possibly read as well.
1122 static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
1123                                   ARMCC::CondCodes Pred, unsigned PredReg) {
1124   bool CheckCPSRDef;
1125   int Scale;
1126   switch (MI.getOpcode()) {
1127   case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
1128   case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
1129   case ARM::t2SUBri:
1130   case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
1131   case ARM::t2ADDri:
1132   case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
1133   case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
1134   case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
1135   default: return 0;
1136   }
1137 
1138   unsigned MIPredReg;
1139   if (MI.getOperand(0).getReg() != Reg ||
1140       MI.getOperand(1).getReg() != Reg ||
1141       getInstrPredicate(MI, MIPredReg) != Pred ||
1142       MIPredReg != PredReg)
1143     return 0;
1144 
1145   if (CheckCPSRDef && definesCPSR(&MI))
1146     return 0;
1147   return MI.getOperand(2).getImm() * Scale;
1148 }
1149 
1150 /// Searches for an increment or decrement of \p Reg before \p MBBI.
1151 static MachineBasicBlock::iterator
1152 findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
1153                  ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
1154   Offset = 0;
1155   MachineBasicBlock &MBB = *MBBI->getParent();
1156   MachineBasicBlock::iterator BeginMBBI = MBB.begin();
1157   MachineBasicBlock::iterator EndMBBI = MBB.end();
1158   if (MBBI == BeginMBBI)
1159     return EndMBBI;
1160 
1161   // Skip debug values.
1162   MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
1163   while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
1164     --PrevMBBI;
1165 
1166   Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
1167   return Offset == 0 ? EndMBBI : PrevMBBI;
1168 }
1169 
1170 /// Searches for a increment or decrement of \p Reg after \p MBBI.
1171 static MachineBasicBlock::iterator
1172 findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
1173                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
1174   Offset = 0;
1175   MachineBasicBlock &MBB = *MBBI->getParent();
1176   MachineBasicBlock::iterator EndMBBI = MBB.end();
1177   MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
1178   // Skip debug values.
1179   while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
1180     ++NextMBBI;
1181   if (NextMBBI == EndMBBI)
1182     return EndMBBI;
1183 
1184   Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
1185   return Offset == 0 ? EndMBBI : NextMBBI;
1186 }
1187 
/// Fold proceeding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;

  const MachineOperand &BaseOP = MI->getOperand(0);
  unsigned Base = BaseOP.getReg();
  bool BaseKill = BaseOP.isKill();
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  // First try to fold an add/sub of the base placed directly before the
  // load/store; findIncDecBefore returns MBB.end() when there is none.
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
  // A preceding decrement of exactly the transfer size folds by switching to
  // the equivalent decrementing submode with writeback.
  if (Mode == ARM_AM::ia && Offset == -Bytes) {
    Mode = ARM_AM::db;
  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
    Mode = ARM_AM::da;
  } else {
    // Otherwise try an add/sub that follows the instruction: the submode is
    // kept and the update becomes the writeback.
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {

      // We couldn't find an inc/dec to merge. But if the base is dead, we
      // can still change to a writeback form as that will save us 2 bytes
      // of code size. It can create WAW hazards though, so only do it if
      // we're minimizing code size.
      if (!MBB.getParent()->getFunction()->optForMinSize() || !BaseKill)
        return false;

      // NOTE(review): registers r8 and above presumably make the writeback
      // form unprofitable or invalid here — confirm against the Thumb2
      // encoding constraints.
      bool HighRegsUsed = false;
      for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
        if (MI->getOperand(i).getReg() >= ARM::R8) {
          HighRegsUsed = true;
          break;
        }

      if (!HighRegsUsed)
        MergeInstr = MBB.end();
      else
        return false;
    }
  }
  // Erase the folded add/sub; MBB.end() is the "nothing to erase" sentinel.
  if (MergeInstr != MBB.end())
    MBB.erase(MergeInstr);

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}
1273 
1274 static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
1275                                              ARM_AM::AddrOpc Mode) {
1276   switch (Opc) {
1277   case ARM::LDRi12:
1278     return ARM::LDR_PRE_IMM;
1279   case ARM::STRi12:
1280     return ARM::STR_PRE_IMM;
1281   case ARM::VLDRS:
1282     return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
1283   case ARM::VLDRD:
1284     return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
1285   case ARM::VSTRS:
1286     return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
1287   case ARM::VSTRD:
1288     return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
1289   case ARM::t2LDRi8:
1290   case ARM::t2LDRi12:
1291     return ARM::t2LDR_PRE;
1292   case ARM::t2STRi8:
1293   case ARM::t2STRi12:
1294     return ARM::t2STR_PRE;
1295   default: llvm_unreachable("Unhandled opcode!");
1296   }
1297 }
1298 
1299 static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
1300                                               ARM_AM::AddrOpc Mode) {
1301   switch (Opc) {
1302   case ARM::LDRi12:
1303     return ARM::LDR_POST_IMM;
1304   case ARM::STRi12:
1305     return ARM::STR_POST_IMM;
1306   case ARM::VLDRS:
1307     return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
1308   case ARM::VLDRD:
1309     return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
1310   case ARM::VSTRS:
1311     return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
1312   case ARM::VSTRD:
1313     return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
1314   case ARM::t2LDRi8:
1315   case ARM::t2LDRi12:
1316     return ARM::t2LDR_POST;
1317   case ARM::t2STRi8:
1318   case ARM::t2STRi12:
1319     return ARM::t2STR_POST;
1320   default: llvm_unreachable("Unhandled opcode!");
1321   }
1322 }
1323 
/// Fold proceeding/trailing inc/dec of base register into the
/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
  // Thumb1 doesn't have updating LDR/STR.
  // FIXME: Use LDM/STM with single register instead.
  if (isThumb1) return false;

  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  // The fold only applies when the existing instruction has a zero immediate
  // offset (the folded inc/dec becomes the new offset).
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  // An inc/dec before the instruction yields a pre-indexed form; one after it
  // yields a post-indexed form. AM5 (VLDR/VSTR) only supports the
  // decrementing pre-indexed and incrementing post-indexed VLDM/VSTM forms.
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  unsigned NewOpc;
  if (!isAM5 && Offset == Bytes) {
    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
  } else if (Offset == -Bytes) {
    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (Offset == Bytes) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
    } else if (!isAM5 && Offset == -Bytes) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
    } else
      return false;
  }
  // Remove the add/sub that is now folded into the addressing mode.
  MBB.erase(MergeInstr);

  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;

  bool isLd = isLoadSingle(Opcode);
  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      // NOTE(review): LDRB_PRE_IMM is tested here but
      // getPreIndexedLoadStoreOpcode never produces it — confirm whether the
      // check is vestigial.
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
      }
    } else {
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigal zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
    } else {
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  // Erase the original non-updating load/store.
  MBB.erase(MBBI);

  return true;
}
1429 
/// Fold a preceding or trailing add/sub of the base register into a
/// t2LDRDi8/t2STRDi8, producing the pre- or post-indexed form. Returns true
/// if the instruction was rewritten.
bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();
  assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
         "Must have t2STRDi8 or t2LDRDi8");
  // Only a zero-offset double can absorb the update into its addressing mode.
  if (MI.getOperand(3).getImm() != 0)
    return false;

  // Behaviour for writeback is undefined if base register is the same as one
  // of the others.
  const MachineOperand &BaseOp = MI.getOperand(2);
  unsigned Base = BaseOp.getReg();
  const MachineOperand &Reg0Op = MI.getOperand(0);
  const MachineOperand &Reg1Op = MI.getOperand(1);
  if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
    return false;

  unsigned PredReg;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  MachineBasicBlock::iterator MBBI(MI);
  MachineBasicBlock &MBB = *MI.getParent();
  int Offset;
  // An update before the instruction yields the pre-indexed form, one after
  // it the post-indexed form. The double transfers exactly 8 bytes, so only
  // a +/-8 adjustment is folded.
  MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
                                                            PredReg, Offset);
  unsigned NewOpc;
  if (Offset == 8 || Offset == -8) {
    NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (Offset == 8 || Offset == -8) {
      NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
    } else
      return false;
  }
  // Remove the add/sub that is now folded into the addressing mode.
  MBB.erase(MergeInstr);

  DebugLoc DL = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  // The writeback def comes after the transfer defs for loads, but first for
  // stores, matching the updating instructions' operand order.
  if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
    MIB.addOperand(Reg0Op).addOperand(Reg1Op)
       .addReg(BaseOp.getReg(), RegState::Define);
  } else {
    assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
    MIB.addReg(BaseOp.getReg(), RegState::Define)
       .addOperand(Reg0Op).addOperand(Reg1Op);
  }
  MIB.addReg(BaseOp.getReg(), RegState::Kill)
     .addImm(Offset).addImm(Pred).addReg(PredReg);
  assert(TII->get(Opcode).getNumOperands() == 6 &&
         TII->get(NewOpc).getNumOperands() == 7 &&
         "Unexpected number of operands in Opcode specification.");

  // Transfer implicit operands.
  for (const MachineOperand &MO : MI.implicit_operands())
    MIB.addOperand(MO);
  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  MBB.erase(MBBI);
  return true;
}
1489 
1490 /// Returns true if instruction is a memory operation that this pass is capable
1491 /// of operating on.
1492 static bool isMemoryOp(const MachineInstr &MI) {
1493   unsigned Opcode = MI.getOpcode();
1494   switch (Opcode) {
1495   case ARM::VLDRS:
1496   case ARM::VSTRS:
1497   case ARM::VLDRD:
1498   case ARM::VSTRD:
1499   case ARM::LDRi12:
1500   case ARM::STRi12:
1501   case ARM::tLDRi:
1502   case ARM::tSTRi:
1503   case ARM::tLDRspi:
1504   case ARM::tSTRspi:
1505   case ARM::t2LDRi8:
1506   case ARM::t2LDRi12:
1507   case ARM::t2STRi8:
1508   case ARM::t2STRi12:
1509     break;
1510   default:
1511     return false;
1512   }
1513   if (!MI.getOperand(1).isReg())
1514     return false;
1515 
1516   // When no memory operands are present, conservatively assume unaligned,
1517   // volatile, unfoldable.
1518   if (!MI.hasOneMemOperand())
1519     return false;
1520 
1521   const MachineMemOperand &MMO = **MI.memoperands_begin();
1522 
1523   // Don't touch volatile memory accesses - we may be changing their order.
1524   if (MMO.isVolatile())
1525     return false;
1526 
1527   // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
1528   // not.
1529   if (MMO.getAlignment() < 4)
1530     return false;
1531 
1532   // str <undef> could probably be eliminated entirely, but for now we just want
1533   // to avoid making a mess of it.
1534   // FIXME: Use str <undef> as a wildcard to enable better stm folding.
1535   if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
1536     return false;
1537 
1538   // Likewise don't mess with references to undefined addresses.
1539   if (MI.getOperand(1).isUndef())
1540     return false;
1541 
1542   return true;
1543 }
1544 
1545 static void InsertLDR_STR(MachineBasicBlock &MBB,
1546                           MachineBasicBlock::iterator &MBBI,
1547                           int Offset, bool isDef,
1548                           DebugLoc DL, unsigned NewOpc,
1549                           unsigned Reg, bool RegDeadKill, bool RegUndef,
1550                           unsigned BaseReg, bool BaseKill, bool BaseUndef,
1551                           bool OffKill, bool OffUndef,
1552                           ARMCC::CondCodes Pred, unsigned PredReg,
1553                           const TargetInstrInfo *TII, bool isT2) {
1554   if (isDef) {
1555     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1556                                       TII->get(NewOpc))
1557       .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
1558       .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1559     MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
1560   } else {
1561     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1562                                       TII->get(NewOpc))
1563       .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
1564       .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1565     MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
1566   }
1567 }
1568 
/// Fix an LDRD/STRD whose register pair is unusable as-is: either the pair
/// violates the ARM-mode "even register followed by the next odd register"
/// constraint, or the instruction is subject to Cortex-M3 errata 602117
/// (LDRD with the base register in the destination list). The doubleword
/// access is rewritten as an LDM/STM when the registers ascend and there is
/// no offset, otherwise it is split into two single-word memory ops.
/// Returns true if the instruction was rewritten; \p MBBI is then updated to
/// the instruction following the erased one.
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  // Only these doubleword ops can trigger the conditions below: the pairing
  // constraint applies to ARM-mode LDRD/STRD, the errata to loads only.
  if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
    return false;

  const MachineOperand &BaseOp = MI->getOperand(2);
  unsigned BaseReg = BaseOp.getReg();
  unsigned EvenReg = MI->getOperand(0).getReg();
  unsigned OddReg  = MI->getOperand(1).getReg();
  // Dwarf register numbers are used to test the even/odd pairing below.
  unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
  unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);

  // ARM errata 602117: LDRD with base in list may result in incorrect base
  // register when interrupted or faulted.
  bool Errata602117 = EvenReg == BaseReg &&
    (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
  // ARM LDRD/STRD needs consecutive registers.
  bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
    (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);

  if (!Errata602117 && !NonConsecutiveRegs)
    return false;

  bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
  bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
  // For loads the destination flag of interest is "dead"; for stores it is
  // "kill" on the source operands.
  bool EvenDeadKill = isLd ?
    MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
  bool EvenUndef = MI->getOperand(0).isUndef();
  bool OddDeadKill  = isLd ?
    MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
  bool OddUndef = MI->getOperand(1).isUndef();
  bool BaseKill = BaseOp.isKill();
  bool BaseUndef = BaseOp.isUndef();
  // Only ARM-mode LDRD/STRD have an explicit offset operand (operand 3).
  bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
  bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
  int OffImm = getMemoryOpOffset(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);

  if (OddRegNum > EvenRegNum && OffImm == 0) {
    // Ascending register numbers and no offset. It's safe to change it to a
    // ldm or stm.
    unsigned NewOpc = (isLd)
      ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
      : (isT2 ? ARM::t2STMIA : ARM::STMIA);
    if (isLd) {
      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
        .addReg(BaseReg, getKillRegState(BaseKill))
        .addImm(Pred).addReg(PredReg)
        .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
        .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
      ++NumLDRD2LDM;
    } else {
      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
        .addReg(BaseReg, getKillRegState(BaseKill))
        .addImm(Pred).addReg(PredReg)
        .addReg(EvenReg,
                getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
        .addReg(OddReg,
                getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
      ++NumSTRD2STM;
    }
  } else {
    // Split into two instructions.
    unsigned NewOpc = (isLd)
      ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
      : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
    // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
    // so adjust and use t2LDRi12 here for that.
    unsigned NewOpc2 = (isLd)
      ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
      : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
    DebugLoc dl = MBBI->getDebugLoc();
    // If this is a load and base register is killed, it may have been
    // re-defed by the load, make sure the first load does not clobber it.
    if (isLd &&
        (BaseKill || OffKill) &&
        (TRI->regsOverlap(EvenReg, BaseReg))) {
      assert(!TRI->regsOverlap(OddReg, BaseReg));
      // Emit the odd (higher-address) load first so the even load, which
      // overwrites the base, comes last.
      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                    OddReg, OddDeadKill, false,
                    BaseReg, false, BaseUndef, false, OffUndef,
                    Pred, PredReg, TII, isT2);
      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                    EvenReg, EvenDeadKill, false,
                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                    Pred, PredReg, TII, isT2);
    } else {
      if (OddReg == EvenReg && EvenDeadKill) {
        // If the two source operands are the same, the kill marker is
        // probably on the first one. e.g.
        // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
        EvenDeadKill = false;
        OddDeadKill = true;
      }
      // Never kill the base register in the first instruction.
      if (EvenReg == BaseReg)
        EvenDeadKill = false;
      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                    EvenReg, EvenDeadKill, EvenUndef,
                    BaseReg, false, BaseUndef, false, OffUndef,
                    Pred, PredReg, TII, isT2);
      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                    OddReg, OddDeadKill, OddUndef,
                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                    Pred, PredReg, TII, isT2);
    }
    if (isLd)
      ++NumLDRD2LDR;
    else
      ++NumSTRD2STR;
  }

  // Erase the original LDRD/STRD; MBBI now refers to the next instruction.
  MBBI = MBB.erase(MBBI);
  return true;
}
1687 
1688 /// An optimization pass to turn multiple LDR / STR ops of the same base and
1689 /// incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  MemOpQueue MemOps;     // Current chain of same-base memory ops, sorted by offset.
  unsigned CurrBase = 0; // Base register of the chain being built (0 = no chain).
  unsigned CurrOpc = ~0u;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned Position = 0; // Index of the instruction within the bottom-up scan.
  assert(Candidates.size() == 0);
  assert(MergeBaseCandidates.size() == 0);
  LiveRegsValid = false;

  // Walk the block bottom-up. I always points one past the instruction being
  // inspected, so erasing via FixInvalidRegPairOp leaves the scan valid.
  for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
       I = MBBI) {
    // The instruction in front of the iterator is the one we look at.
    MBBI = std::prev(I);
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;
    ++Position;

    if (isMemoryOp(*MBBI)) {
      unsigned Opcode = MBBI->getOpcode();
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      if (CurrBase == 0) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrPred = Pred;
        MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
        continue;
      }
      // Note: No need to match PredReg in the next if.
      if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
        // Watch out for:
        //   r4 := ldr [r0, #8]
        //   r4 := ldr [r0, #4]
        // or
        //   r0 := ldr [r0]
        // If a load overrides the base register or a register loaded by
        // another load in our chain, we cannot take this instruction.
        bool Overlap = false;
        if (isLoadSingle(Opcode)) {
          Overlap = (Base == Reg);
          if (!Overlap) {
            for (const MemOpQueueEntry &E : MemOps) {
              if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
                Overlap = true;
                break;
              }
            }
          }
        }

        if (!Overlap) {
          // Check offset and sort memory operation into the current chain.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
            continue;
          } else {
            // Find the insertion point that keeps MemOps sorted by offset.
            MemOpQueue::iterator MI, ME;
            for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
              if (Offset < MI->Offset) {
                // Found a place to insert.
                break;
              }
              if (Offset == MI->Offset) {
                // Collision, abort.
                MI = ME;
                break;
              }
            }
            if (MI != MemOps.end()) {
              MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
              continue;
            }
          }
        }
      }

      // Don't advance the iterator; The op will start a new chain next.
      MBBI = I;
      --Position;
      // Fallthrough to look into existing chain.
    } else if (MBBI->isDebugValue()) {
      continue;
    } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
               MBBI->getOpcode() == ARM::t2STRDi8) {
      // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
      // remember them because we may still be able to merge add/sub into them.
      MergeBaseCandidates.push_back(MBBI);
    }


    // If we are here then the chain is broken; Extract candidates for a merge.
    if (MemOps.size() > 0) {
      FormCandidates(MemOps);
      // Reset for the next chain.
      CurrBase = 0;
      CurrOpc = ~0u;
      CurrPred = ARMCC::AL;
      MemOps.clear();
    }
  }
  // Flush the chain that was still open at the top of the block.
  if (MemOps.size() > 0)
    FormCandidates(MemOps);

  // Sort candidates so they get processed from end to begin of the basic
  // block later; This is necessary for liveness calculation.
  auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
    return M0->InsertPos < M1->InsertPos;
  };
  std::sort(Candidates.begin(), Candidates.end(), LessThan);

  // Go through list of candidates and merge.
  bool Changed = false;
  for (const MergeCandidate *Candidate : Candidates) {
    if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
      // Merge preceding/trailing base inc/dec into the merged op.
      if (Merged) {
        Changed = true;
        unsigned Opcode = Merged->getOpcode();
        if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
          MergeBaseUpdateLSDouble(*Merged);
        else
          MergeBaseUpdateLSMultiple(Merged);
      } else {
        // No multi-op merge happened; try folding base updates into the
        // individual loads/stores instead.
        for (MachineInstr *MI : Candidate->Instrs) {
          if (MergeBaseUpdateLoadStore(MI))
            Changed = true;
        }
      }
    } else {
      assert(Candidate->Instrs.size() == 1);
      if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
        Changed = true;
    }
  }
  Candidates.clear();
  // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
  for (MachineInstr *MI : MergeBaseCandidates)
    MergeBaseUpdateLSDouble(*MI);
  MergeBaseCandidates.clear();

  return Changed;
}
1839 
1840 /// If this is a exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
1841 /// into the preceding stack restore so it directly restore the value of LR
1842 /// into pc.
1843 ///   ldmfd sp!, {..., lr}
1844 ///   bx lr
1845 /// or
1846 ///   ldmfd sp!, {..., lr}
1847 ///   mov pc, lr
1848 /// =>
1849 ///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  // Thumb1 LDM doesn't allow high registers.
  if (isThumb1) return false;
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineBasicBlock::iterator PrevI = std::prev(MBBI);
    // Ignore any DBG_VALUE instructions.
    while (PrevI->isDebugValue() && PrevI != MBB.begin())
      --PrevI;
    MachineInstr *PrevMI = PrevI;
    unsigned Opcode = PrevMI->getOpcode();
    // Only writeback LDMs preceding the return are candidates for the fold.
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      // LR must be the last register in the list for the fold to be valid.
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      // Rewrite the LDM as a return form, retarget lr to pc, and drop the
      // now-redundant return instruction (keeping its implicit operands).
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(*MBB.getParent(), *MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}
1884 
/// Fold a Thumb1 "tMOVr lr, <reg> ; tBX_RET" pair into a single "tBX <reg>".
/// The tMOVr must define lr and carry a kill of its source register, which
/// becomes the tBX operand. Returns true if the fold was performed.
bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  if (MBBI == MBB.begin() || MBBI == MBB.end() ||
      MBBI->getOpcode() != ARM::tBX_RET)
    return false;

  MachineBasicBlock::iterator Prev = MBBI;
  --Prev;
  if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
    return false;

  // Find the killed source of the tMOVr; it becomes the tBX operand.
  for (auto Use : Prev->uses())
    if (Use.isKill()) {
      AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
                         .addReg(Use.getReg(), RegState::Kill))
          .copyImplicitOps(*MBBI);
      // Both original instructions are now dead.
      MBB.erase(MBBI);
      MBB.erase(Prev);
      return true;
    }

  llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
}
1908 
1909 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1910   if (skipFunction(*Fn.getFunction()))
1911     return false;
1912 
1913   MF = &Fn;
1914   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1915   TL = STI->getTargetLowering();
1916   AFI = Fn.getInfo<ARMFunctionInfo>();
1917   TII = STI->getInstrInfo();
1918   TRI = STI->getRegisterInfo();
1919 
1920   RegClassInfoValid = false;
1921   isThumb2 = AFI->isThumb2Function();
1922   isThumb1 = AFI->isThumbFunction() && !isThumb2;
1923 
1924   bool Modified = false;
1925   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1926        ++MFI) {
1927     MachineBasicBlock &MBB = *MFI;
1928     Modified |= LoadStoreMultipleOpti(MBB);
1929     if (STI->hasV5TOps())
1930       Modified |= MergeReturnIntoLDM(MBB);
1931     if (isThumb1)
1932       Modified |= CombineMovBx(MBB);
1933   }
1934 
1935   Allocator.DestroyAll();
1936   return Modified;
1937 }
1938 
1939 namespace llvm {
1940 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
1941 }
1942 
1943 #define ARM_PREALLOC_LOAD_STORE_OPT_NAME                                       \
1944   "ARM pre- register allocation load / store optimization pass"
1945 
1946 namespace {
1947   /// Pre- register allocation pass that move load / stores from consecutive
1948   /// locations close to make it more likely they will be combined later.
1949   struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
1950     static char ID;
1951     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {
1952       initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry());
1953     }
1954 
1955     const DataLayout *TD;
1956     const TargetInstrInfo *TII;
1957     const TargetRegisterInfo *TRI;
1958     const ARMSubtarget *STI;
1959     MachineRegisterInfo *MRI;
1960     MachineFunction *MF;
1961 
1962     bool runOnMachineFunction(MachineFunction &Fn) override;
1963 
1964     const char *getPassName() const override {
1965       return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
1966     }
1967 
1968   private:
1969     bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
1970                           unsigned &NewOpc, unsigned &EvenReg,
1971                           unsigned &OddReg, unsigned &BaseReg,
1972                           int &Offset,
1973                           unsigned &PredReg, ARMCC::CondCodes &Pred,
1974                           bool &isT2);
1975     bool RescheduleOps(MachineBasicBlock *MBB,
1976                        SmallVectorImpl<MachineInstr *> &Ops,
1977                        unsigned Base, bool isLd,
1978                        DenseMap<MachineInstr*, unsigned> &MI2LocMap);
1979     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
1980   };
1981   char ARMPreAllocLoadStoreOpt::ID = 0;
1982 }
1983 
1984 INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
1985                 ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
1986 
1987 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1988   if (AssumeMisalignedLoadStores || skipFunction(*Fn.getFunction()))
1989     return false;
1990 
1991   TD = &Fn.getDataLayout();
1992   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1993   TII = STI->getInstrInfo();
1994   TRI = STI->getRegisterInfo();
1995   MRI = &Fn.getRegInfo();
1996   MF  = &Fn;
1997 
1998   bool Modified = false;
1999   for (MachineBasicBlock &MFI : Fn)
2000     Modified |= RescheduleLoadStoreInstrs(&MFI);
2001 
2002   return Modified;
2003 }
2004 
2005 static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
2006                                       MachineBasicBlock::iterator I,
2007                                       MachineBasicBlock::iterator E,
2008                                       SmallPtrSetImpl<MachineInstr*> &MemOps,
2009                                       SmallSet<unsigned, 4> &MemRegs,
2010                                       const TargetRegisterInfo *TRI) {
2011   // Are there stores / loads / calls between them?
2012   // FIXME: This is overly conservative. We should make use of alias information
2013   // some day.
2014   SmallSet<unsigned, 4> AddedRegPressure;
2015   while (++I != E) {
2016     if (I->isDebugValue() || MemOps.count(&*I))
2017       continue;
2018     if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
2019       return false;
2020     if (isLd && I->mayStore())
2021       return false;
2022     if (!isLd) {
2023       if (I->mayLoad())
2024         return false;
2025       // It's not safe to move the first 'str' down.
2026       // str r1, [r0]
2027       // strh r5, [r0]
2028       // str r4, [r0, #+4]
2029       if (I->mayStore())
2030         return false;
2031     }
2032     for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
2033       MachineOperand &MO = I->getOperand(j);
2034       if (!MO.isReg())
2035         continue;
2036       unsigned Reg = MO.getReg();
2037       if (MO.isDef() && TRI->regsOverlap(Reg, Base))
2038         return false;
2039       if (Reg != Base && !MemRegs.count(Reg))
2040         AddedRegPressure.insert(Reg);
2041     }
2042   }
2043 
2044   // Estimate register pressure increase due to the transformation.
2045   if (MemRegs.size() <= 4)
2046     // Ok if we are moving small number of instructions.
2047     return true;
2048   return AddedRegPressure.size() <= MemRegs.size() * 2;
2049 }
2050 
2051 bool
2052 ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
2053                                           DebugLoc &dl, unsigned &NewOpc,
2054                                           unsigned &FirstReg,
2055                                           unsigned &SecondReg,
2056                                           unsigned &BaseReg, int &Offset,
2057                                           unsigned &PredReg,
2058                                           ARMCC::CondCodes &Pred,
2059                                           bool &isT2) {
2060   // Make sure we're allowed to generate LDRD/STRD.
2061   if (!STI->hasV5TEOps())
2062     return false;
2063 
2064   // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
2065   unsigned Scale = 1;
2066   unsigned Opcode = Op0->getOpcode();
2067   if (Opcode == ARM::LDRi12) {
2068     NewOpc = ARM::LDRD;
2069   } else if (Opcode == ARM::STRi12) {
2070     NewOpc = ARM::STRD;
2071   } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
2072     NewOpc = ARM::t2LDRDi8;
2073     Scale = 4;
2074     isT2 = true;
2075   } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
2076     NewOpc = ARM::t2STRDi8;
2077     Scale = 4;
2078     isT2 = true;
2079   } else {
2080     return false;
2081   }
2082 
2083   // Make sure the base address satisfies i64 ld / st alignment requirement.
2084   // At the moment, we ignore the memoryoperand's value.
2085   // If we want to use AliasAnalysis, we should check it accordingly.
2086   if (!Op0->hasOneMemOperand() ||
2087       (*Op0->memoperands_begin())->isVolatile())
2088     return false;
2089 
2090   unsigned Align = (*Op0->memoperands_begin())->getAlignment();
2091   const Function *Func = MF->getFunction();
2092   unsigned ReqAlign = STI->hasV6Ops()
2093     ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
2094     : 8;  // Pre-v6 need 8-byte align
2095   if (Align < ReqAlign)
2096     return false;
2097 
2098   // Then make sure the immediate offset fits.
2099   int OffImm = getMemoryOpOffset(Op0);
2100   if (isT2) {
2101     int Limit = (1 << 8) * Scale;
2102     if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
2103       return false;
2104     Offset = OffImm;
2105   } else {
2106     ARM_AM::AddrOpc AddSub = ARM_AM::add;
2107     if (OffImm < 0) {
2108       AddSub = ARM_AM::sub;
2109       OffImm = - OffImm;
2110     }
2111     int Limit = (1 << 8) * Scale;
2112     if (OffImm >= Limit || (OffImm & (Scale-1)))
2113       return false;
2114     Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
2115   }
2116   FirstReg = Op0->getOperand(0).getReg();
2117   SecondReg = Op1->getOperand(0).getReg();
2118   if (FirstReg == SecondReg)
2119     return false;
2120   BaseReg = Op0->getOperand(1).getReg();
2121   Pred = getInstrPredicate(*Op0, PredReg);
2122   dl = Op0->getDebugLoc();
2123   return true;
2124 }
2125 
/// Try to move the loads/stores in \p Ops (which all use base register
/// \p Base) next to each other, forming an LDRD/STRD when exactly two ops
/// qualify. \p MI2LocMap gives each instruction a scan-order location used
/// to bound how far instructions are moved. Returns true if any instruction
/// was moved or rewritten.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVectorImpl<MachineInstr *> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(),
            [](const MachineInstr *LHS, const MachineInstr *RHS) {
    int LOffset = getMemoryOpOffset(LHS);
    int ROffset = getMemoryOpOffset(RHS);
    assert(LHS == RHS || LOffset != ROffset);
    return LOffset > ROffset;
  });

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = nullptr;
    MachineInstr *LastOp = nullptr;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Walk ops from the lowest offset (end of the reverse-sorted vector)
    // upwards, growing a run of same-opcode, gap-free, consecutive accesses.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned LSMOpcode
        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
      if (LastOpcode && LSMOpcode != LastOpcode)
        break;

      // Require strictly consecutive accesses: same width, no gaps.
      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = LSMOpcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      // No run formed ending at this op; drop it and retry with the rest.
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned FirstReg = 0, SecondReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             FirstReg, SecondReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          const MCInstrDesc &MCID = TII->get(NewOpc);
          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
          MRI->constrainRegClass(FirstReg, TRC);
          MRI->constrainRegClass(SecondReg, TRC);

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(FirstReg, RegState::Define)
              .addReg(SecondReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always by reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(FirstReg)
              .addReg(SecondReg)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always by reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          if (!isT2) {
            // Add register allocation hints to form register pairs.
            MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
            MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
          }
        } else {
          // No pair was formed; just move the whole run to the insert point.
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
2286 
/// Scan \p MBB for unpredicated single loads/stores, group them by base
/// register, and hand each group with more than one member to RescheduleOps.
/// Calls, terminators, and repeated base+offset accesses end the current
/// region; each region is processed independently.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  // MI2LocMap assigns each instruction an increasing "location" index, later
  // used by RescheduleOps to bound how far apart moved instructions may be.
  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      if (MI->isCall() || MI->isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      // Only unpredicated single memory ops participate.
      if (!isMemoryOp(*MI))
        continue;
      unsigned PredReg = 0;
      if (getInstrPredicate(*MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isLoadSingle(Opc);
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          // A base+offset already seen in this region ends the region.
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2LdsMap[Base].push_back(MI);
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          // Same duplicate check as for loads, on the store side.
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2StsMap[Base].push_back(MI);
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    // Start a fresh region after a barrier or duplicate access.
    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}
2392 
2393 
2394 /// Returns an instance of the load / store optimization pass.
2395 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
2396   if (PreAlloc)
2397     return new ARMPreAllocLoadStoreOpt();
2398   return new ARMLoadStoreOpt();
2399 }
2400