1 //===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #define DEBUG_TYPE "arm-ldst-opt"
16 #include "ARM.h"
17 #include "ARMAddressingModes.h"
18 #include "ARMBaseInstrInfo.h"
19 #include "ARMMachineFunctionInfo.h"
20 #include "ARMRegisterInfo.h"
21 #include "llvm/DerivedTypes.h"
22 #include "llvm/Function.h"
23 #include "llvm/CodeGen/MachineBasicBlock.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/MachineInstr.h"
26 #include "llvm/CodeGen/MachineInstrBuilder.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/CodeGen/RegisterScavenging.h"
29 #include "llvm/Target/TargetData.h"
30 #include "llvm/Target/TargetInstrInfo.h"
31 #include "llvm/Target/TargetMachine.h"
32 #include "llvm/Target/TargetRegisterInfo.h"
33 #include "llvm/Support/ErrorHandling.h"
34 #include "llvm/ADT/DenseMap.h"
35 #include "llvm/ADT/STLExtras.h"
36 #include "llvm/ADT/SmallPtrSet.h"
37 #include "llvm/ADT/SmallSet.h"
38 #include "llvm/ADT/SmallVector.h"
39 #include "llvm/ADT/Statistic.h"
40 using namespace llvm;
41 
// Statistics collected by this pass; printed when -stats is given. The
// "Gened" counters are bumped inside getLoadStoreMultipleOpcode() as the
// merged instructions are created.
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
53 
/// ARMLoadStoreOpt - Post-register-allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
56 
namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetInstrInfo *TII;   // Set up per-function in runOnMachineFunction.
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;             // Used to find a scratch base register.
    bool isThumb2;                // Selects thumb2 opcodes / encodings.

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    // MemOpQueueEntry - One candidate load/store: its immediate offset from
    // the common base register, its ordinal position within the basic block
    // (used for ordering decisions), an iterator to the instruction, and
    // whether it has already been folded into a multiple op.
    struct MemOpQueueEntry {
      int Offset;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    // MergeOps - Emit one LDM/STM/VLDM/VSTM covering Regs; see definition.
    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    // MergeOpsUpdate - Wrapper around MergeOps that also erases the merged
    // singles and fixes up kill flags on success.
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    // MergeLDR_STR - Partition MemOps[SIndex..] into mergeable runs.
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}
127 
128 static int getLoadStoreMultipleOpcode(int Opcode) {
129   switch (Opcode) {
130   case ARM::LDR:
131     NumLDMGened++;
132     return ARM::LDM;
133   case ARM::STR:
134     NumSTMGened++;
135     return ARM::STM;
136   case ARM::t2LDRi8:
137   case ARM::t2LDRi12:
138     NumLDMGened++;
139     return ARM::t2LDM;
140   case ARM::t2STRi8:
141   case ARM::t2STRi12:
142     NumSTMGened++;
143     return ARM::t2STM;
144   case ARM::VLDRS:
145     NumVLDMGened++;
146     return ARM::VLDMS;
147   case ARM::VSTRS:
148     NumVSTMGened++;
149     return ARM::VSTMS;
150   case ARM::VLDRD:
151     NumVLDMGened++;
152     return ARM::VLDMD;
153   case ARM::VSTRD:
154     NumVSTMGened++;
155     return ARM::VSTMD;
156   default: llvm_unreachable("Unhandled opcode!");
157   }
158   return 0;
159 }
160 
161 static bool isT2i32Load(unsigned Opc) {
162   return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
163 }
164 
165 static bool isi32Load(unsigned Opc) {
166   return Opc == ARM::LDR || isT2i32Load(Opc);
167 }
168 
169 static bool isT2i32Store(unsigned Opc) {
170   return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
171 }
172 
173 static bool isi32Store(unsigned Opc) {
174   return Opc == ARM::STR || isT2i32Store(Opc);
175 }
176 
/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
///
/// Offset is the starting offset (of the first register) from Base; the
/// addressing sub-mode (ia/ib/da/db) is derived from it. Scratch is a
/// register that may be clobbered to materialize a new base when the offset
/// pattern fits none of the sub-modes. Each Regs entry is (register, isKill).
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  // Pick the sub-mode from the starting offset. Integer ldm/stm (AM4) can
  // use ia/ib/da/db; the VFP forms (AM5) below only get ia.
  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination register to
      // use as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register to use as a new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    // The add/sub immediate must be encodable as a (t2) shifter operand.
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false;  // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right its use.
  }

  // Build the multiple op. AM4 encodes only the sub-mode; AM5 also encodes
  // the transfer count (doubled for D registers, which are two words each).
  bool isDPR = Opcode == ARM::VLDRD || Opcode == ARM::VSTRD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}
264 
// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success. The merged instruction replaces memOps[memOpsBegin..memOpsEnd)
// and is inserted right after memOps[insertAfter]; the new instruction is
// recorded in Merges and the original singles are erased.
void ARMLoadStoreOpt::
MergeOpsUpdate(MachineBasicBlock &MBB,
               MemOpQueue &memOps,
               unsigned memOpsBegin,
               unsigned memOpsEnd,
               unsigned insertAfter,
               int Offset,
               unsigned Base,
               bool BaseKill,
               int Opcode,
               ARMCC::CondCodes Pred,
               unsigned PredReg,
               unsigned Scratch,
               DebugLoc dl,
               SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  const unsigned insertPos = memOps[insertAfter].Position;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Operand 0 of each single load/store is the transfer register.
    const MachineOperand &MO = memOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    bool isKill = MO.isKill();

    // If we are inserting the merged operation after an unmerged operation that
    // uses the same register, make sure to transfer any kill flag.
    for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j)
      if (memOps[j].Position<insertPos) {
        const MachineOperand &MOJ = memOps[j].MBBI->getOperand(0);
        if (MOJ.getReg() == Reg && MOJ.isKill())
          isKill = true;
      }

    Regs.push_back(std::make_pair(Reg, isKill));
  }

  // Try to do the merge. Insert right AFTER memOps[insertAfter].
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  Loc++;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(prior(Loc));
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any unmerged memops that come before insertPos.
    // The merged instruction now carries the kill, so an earlier unmerged
    // use of the same register must not also claim to kill it.
    if (Regs[i-memOpsBegin].second)
      for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j)
        if (memOps[j].Position<insertPos) {
          MachineOperand &MOJ = memOps[j].MBBI->getOperand(0);
          if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill())
            MOJ.setIsKill(false);
        }
    MBB.erase(memOps[i].MBBI);
    memOps[i].Merged = true;
  }
}
325 
/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
///
/// Scans MemOps starting at SIndex for the longest run with consecutive
/// offsets and suitably ordered register numbers, merges that run via
/// MergeOpsUpdate, then recurses on the remainder.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;           // Starting offset of the current run.
  unsigned insertAfter = SIndex;  // Latest-positioned memop seen so far.
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  // UINT_MAX for undef so no successor register can extend the run off it.
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX
    : ARMRegisterInfo::getRegisterNumbering(PReg);

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX
      : ARMRegisterInfo::getRegisterNumbering(Reg);
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try merge the earlier ones first.
      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    // Track the memop that occurs last in the block; the merged instruction
    // must be inserted after it so all inputs are available.
    if (MemOps[i].Position > MemOps[insertAfter].Position)
      insertAfter = i;
  }

  // The whole remaining queue forms one run; the merged op may kill Base if
  // the first memop did.
  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
  return;
}
375 
376 static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
377                                        unsigned Bytes, unsigned Limit,
378                                        ARMCC::CondCodes Pred, unsigned PredReg){
379   unsigned MyPredReg = 0;
380   if (!MI)
381     return false;
382   if (MI->getOpcode() != ARM::t2SUBri &&
383       MI->getOpcode() != ARM::t2SUBrSPi &&
384       MI->getOpcode() != ARM::t2SUBrSPi12 &&
385       MI->getOpcode() != ARM::tSUBspi &&
386       MI->getOpcode() != ARM::SUBri)
387     return false;
388 
389   // Make sure the offset fits in 8 bits.
390   if (Bytes <= 0 || (Limit && Bytes >= Limit))
391     return false;
392 
393   unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
394   return (MI->getOperand(0).getReg() == Base &&
395           MI->getOperand(1).getReg() == Base &&
396           (MI->getOperand(2).getImm()*Scale) == Bytes &&
397           llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
398           MyPredReg == PredReg);
399 }
400 
401 static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
402                                        unsigned Bytes, unsigned Limit,
403                                        ARMCC::CondCodes Pred, unsigned PredReg){
404   unsigned MyPredReg = 0;
405   if (!MI)
406     return false;
407   if (MI->getOpcode() != ARM::t2ADDri &&
408       MI->getOpcode() != ARM::t2ADDrSPi &&
409       MI->getOpcode() != ARM::t2ADDrSPi12 &&
410       MI->getOpcode() != ARM::tADDspi &&
411       MI->getOpcode() != ARM::ADDri)
412     return false;
413 
414   if (Bytes <= 0 || (Limit && Bytes >= Limit))
415     // Make sure the offset fits in 8 bits.
416     return false;
417 
418   unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
419   return (MI->getOperand(0).getReg() == Base &&
420           MI->getOperand(1).getReg() == Base &&
421           (MI->getOperand(2).getImm()*Scale) == Bytes &&
422           llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
423           MyPredReg == PredReg);
424 }
425 
426 static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
427   switch (MI->getOpcode()) {
428   default: return 0;
429   case ARM::LDR:
430   case ARM::STR:
431   case ARM::t2LDRi8:
432   case ARM::t2LDRi12:
433   case ARM::t2STRi8:
434   case ARM::t2STRi12:
435   case ARM::VLDRS:
436   case ARM::VSTRS:
437     return 4;
438   case ARM::VLDRD:
439   case ARM::VSTRD:
440     return 8;
441   case ARM::LDM:
442   case ARM::STM:
443   case ARM::t2LDM:
444   case ARM::t2STM:
445     return (MI->getNumOperands() - 5) * 4;
446   case ARM::VLDMS:
447   case ARM::VSTMS:
448   case ARM::VLDMD:
449   case ARM::VSTMD:
450     return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
451   }
452 }
453 
454 /// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
455 /// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
456 ///
457 /// stmia rn, <ra, rb, rc>
458 /// rn := rn + 4 * 3;
459 /// =>
460 /// stmia rn!, <ra, rb, rc>
461 ///
462 /// rn := rn - 4 * 3;
463 /// ldmia rn, <ra, rb, rc>
464 /// =>
465 /// ldmdb rn!, <ra, rb, rc>
466 bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
467                                                MachineBasicBlock::iterator MBBI,
468                                                bool &Advance,
469                                                MachineBasicBlock::iterator &I) {
470   MachineInstr *MI = MBBI;
471   unsigned Base = MI->getOperand(0).getReg();
472   unsigned Bytes = getLSMultipleTransferSize(MI);
473   unsigned PredReg = 0;
474   ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
475   int Opcode = MI->getOpcode();
476   bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
477     Opcode == ARM::STM || Opcode == ARM::t2STM;
478 
479   if (isAM4) {
480     if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
481       return false;
482 
483     // Can't use the updating AM4 sub-mode if the base register is also a dest
484     // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
485     for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
486       if (MI->getOperand(i).getReg() == Base)
487         return false;
488     }
489 
490     ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
491     if (MBBI != MBB.begin()) {
492       MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
493       if (Mode == ARM_AM::ia &&
494           isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
495         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
496         MI->getOperand(4).setReg(Base);
497         MI->getOperand(4).setIsDef();
498         MBB.erase(PrevMBBI);
499         return true;
500       } else if (Mode == ARM_AM::ib &&
501                  isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
502         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
503         MI->getOperand(4).setReg(Base);  // WB to base
504         MI->getOperand(4).setIsDef();
505         MBB.erase(PrevMBBI);
506         return true;
507       }
508     }
509 
510     if (MBBI != MBB.end()) {
511       MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
512       if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
513           isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
514         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
515         MI->getOperand(4).setReg(Base);  // WB to base
516         MI->getOperand(4).setIsDef();
517         if (NextMBBI == I) {
518           Advance = true;
519           ++I;
520         }
521         MBB.erase(NextMBBI);
522         return true;
523       } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
524                  isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
525         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
526         MI->getOperand(4).setReg(Base);  // WB to base
527         MI->getOperand(4).setIsDef();
528         if (NextMBBI == I) {
529           Advance = true;
530           ++I;
531         }
532         MBB.erase(NextMBBI);
533         return true;
534       }
535     }
536   } else {
537     // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops.
538     if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
539       return false;
540 
541     ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
542     unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
543     if (MBBI != MBB.begin()) {
544       MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
545       if (Mode == ARM_AM::ia &&
546           isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
547         MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
548         MI->getOperand(4).setReg(Base);  // WB to base
549         MI->getOperand(4).setIsDef();
550         MBB.erase(PrevMBBI);
551         return true;
552       }
553     }
554 
555     if (MBBI != MBB.end()) {
556       MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
557       if (Mode == ARM_AM::ia &&
558           isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
559         MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
560         MI->getOperand(4).setReg(Base);  // WB to base
561         MI->getOperand(4).setIsDef();
562         if (NextMBBI == I) {
563           Advance = true;
564           ++I;
565         }
566         MBB.erase(NextMBBI);
567       }
568       return true;
569     }
570   }
571 
572   return false;
573 }
574 
575 static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
576   switch (Opc) {
577   case ARM::LDR: return ARM::LDR_PRE;
578   case ARM::STR: return ARM::STR_PRE;
579   case ARM::VLDRS: return ARM::VLDMS;
580   case ARM::VLDRD: return ARM::VLDMD;
581   case ARM::VSTRS: return ARM::VSTMS;
582   case ARM::VSTRD: return ARM::VSTMD;
583   case ARM::t2LDRi8:
584   case ARM::t2LDRi12:
585     return ARM::t2LDR_PRE;
586   case ARM::t2STRi8:
587   case ARM::t2STRi12:
588     return ARM::t2STR_PRE;
589   default: llvm_unreachable("Unhandled opcode!");
590   }
591   return 0;
592 }
593 
594 static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
595   switch (Opc) {
596   case ARM::LDR: return ARM::LDR_POST;
597   case ARM::STR: return ARM::STR_POST;
598   case ARM::VLDRS: return ARM::VLDMS;
599   case ARM::VLDRD: return ARM::VLDMD;
600   case ARM::VSTRS: return ARM::VSTMS;
601   case ARM::VSTRD: return ARM::VSTMD;
602   case ARM::t2LDRi8:
603   case ARM::t2LDRi12:
604     return ARM::t2LDR_POST;
605   case ARM::t2STRi8:
606   case ARM::t2STRi12:
607     return ARM::t2STR_POST;
608   default: llvm_unreachable("Unhandled opcode!");
609   }
610   return 0;
611 }
612 
/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
/// the original single load / store and the matching add / sub are replaced
/// by one pre- or post-indexed instruction. Returns true on success.
/// Advance / I follow the same iterator-fixup protocol as
/// MergeBaseUpdateLSMultiple.
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
    Opcode == ARM::VSTRD || Opcode == ARM::VSTRS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  // Only offset-0 singles can be converted to pre/post-indexed forms.
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  // Check for a matching add / sub immediately BEFORE: becomes pre-indexed.
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Otherwise check for a matching add / sub immediately AFTER: post-indexed.
  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  // Encode the new offset / mode immediate for the chosen addressing mode.
  bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD;
  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // VLDMS, VLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST,
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // VSTMS, VSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  // The original single load / store is now dead.
  MBB.erase(MBBI);

  return true;
}
739 
740 /// isMemoryOp - Returns true if instruction is a memory operations (that this
741 /// pass is capable of operating on).
742 static bool isMemoryOp(const MachineInstr *MI) {
743   int Opcode = MI->getOpcode();
744   switch (Opcode) {
745   default: break;
746   case ARM::LDR:
747   case ARM::STR:
748     return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
749   case ARM::VLDRS:
750   case ARM::VSTRS:
751     return MI->getOperand(1).isReg();
752   case ARM::VLDRD:
753   case ARM::VSTRD:
754     return MI->getOperand(1).isReg();
755   case ARM::t2LDRi8:
756   case ARM::t2LDRi12:
757   case ARM::t2STRi8:
758   case ARM::t2STRi12:
759     return MI->getOperand(1).isReg();
760   }
761   return false;
762 }
763 
764 /// AdvanceRS - Advance register scavenger to just before the earliest memory
765 /// op that is being merged.
766 void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
767   MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
768   unsigned Position = MemOps[0].Position;
769   for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
770     if (MemOps[i].Position < Position) {
771       Position = MemOps[i].Position;
772       Loc = MemOps[i].MBBI;
773     }
774   }
775 
776   if (Loc != MBB.begin())
777     RS->forward(prior(Loc));
778 }
779 
780 static int getMemoryOpOffset(const MachineInstr *MI) {
781   int Opcode = MI->getOpcode();
782   bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
783   bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
784   unsigned NumOperands = MI->getDesc().getNumOperands();
785   unsigned OffField = MI->getOperand(NumOperands-3).getImm();
786 
787   if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
788       Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
789       Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
790     return OffField;
791 
792   int Offset = isAM2
793     ? ARM_AM::getAM2Offset(OffField)
794     : (isAM3 ? ARM_AM::getAM3Offset(OffField)
795              : ARM_AM::getAM5Offset(OffField) * 4);
796   if (isAM2) {
797     if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
798       Offset = -Offset;
799   } else if (isAM3) {
800     if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
801       Offset = -Offset;
802   } else {
803     if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
804       Offset = -Offset;
805   }
806   return Offset;
807 }
808 
809 static void InsertLDR_STR(MachineBasicBlock &MBB,
810                           MachineBasicBlock::iterator &MBBI,
811                           int OffImm, bool isDef,
812                           DebugLoc dl, unsigned NewOpc,
813                           unsigned Reg, bool RegDeadKill, bool RegUndef,
814                           unsigned BaseReg, bool BaseKill, bool BaseUndef,
815                           unsigned OffReg, bool OffKill, bool OffUndef,
816                           ARMCC::CondCodes Pred, unsigned PredReg,
817                           const TargetInstrInfo *TII, bool isT2) {
818   int Offset = OffImm;
819   if (!isT2) {
820     if (OffImm < 0)
821       Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
822     else
823       Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
824   }
825   if (isDef) {
826     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
827                                       TII->get(NewOpc))
828       .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
829       .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
830     if (!isT2)
831       MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
832     MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
833   } else {
834     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
835                                       TII->get(NewOpc))
836       .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
837       .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
838     if (!isT2)
839       MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
840     MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
841   }
842 }
843 
/// FixInvalidRegPairOp - LDRD / STRD require an even / odd consecutive
/// register pair. If this instruction's registers do not form such a pair,
/// rewrite it as either a single LDM / STM (ascending registers, zero
/// offset) or as two separate single-word loads / stores. MBBI is updated
/// to point at the last replacement instruction.
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    // Dwarf register numbers are used to test even / odd adjacency of the
    // physical registers (assumes GPR Dwarf numbering is sequential --
    // TODO confirm for this target).
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      // Already a legal even / odd pair -- nothing to fix.
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    // For loads the transfer registers are defs (dead flag is relevant);
    // for stores they are uses (kill flag is relevant).
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    // Thumb2 LDRD / STRD has no offset register operand.
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        // EvenReg overlaps an address register: emit the odd half first so
        // the address is still intact when the even half executes.
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is probably
          // on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    // Leave MBBI pointing at the last inserted replacement and remove the
    // original LDRD / STRD.
    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  // Always returns false; the caller continues scanning from the updated
  // MBBI.
  return false;
}
955 
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;        // Number of entries currently in MemOps.
  MemOpQueue MemOps;             // Candidate ops of the chain in progress.
  // State describing the chain being accumulated. CurrBase == 0 means no
  // chain is currently in progress.
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;         // Running index of MBBI within the block.
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    // Rewrite illegal LDRD / STRD register pairs first; this may replace
    // the instruction MBBI points at.
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;  // Advance MBBI at the end of this iteration?
    bool TryMerge = false;  // Attempt to merge the accumulated chain now?
    bool Clobber  = false;  // Load that overwrites its own base register?

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            // Insertion-sort the op into the queue by offset; a duplicate
            // offset means this op cannot join the chain.
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reach the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        // The formed LDM/STM instructions themselves also count as merges,
        // in addition to any base-update folds above.
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      // Reset chain state; the next memory op starts a fresh chain.
      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}
1108 
1109 namespace {
1110   struct OffsetCompare {
1111     bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
1112       int LOffset = getMemoryOpOffset(LHS);
1113       int ROffset = getMemoryOpOffset(RHS);
1114       assert(LHS == RHS || LOffset != ROffset);
1115       return LOffset > ROffset;
1116     }
1117   };
1118 }
1119 
1120 /// MergeReturnIntoLDM - If this is a exit BB, try merging the return op
1121 /// (bx lr) into the preceeding stack restore so it directly restore the value
1122 /// of LR into pc.
1123 ///   ldmfd sp!, {r7, lr}
1124 ///   bx lr
1125 /// =>
1126 ///   ldmfd sp!, {r7, pc}
1127 bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
1128   if (MBB.empty()) return false;
1129 
1130   MachineBasicBlock::iterator MBBI = prior(MBB.end());
1131   if (MBBI != MBB.begin() &&
1132       (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
1133     MachineInstr *PrevMI = prior(MBBI);
1134     if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
1135       MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
1136       if (MO.getReg() != ARM::LR)
1137         return false;
1138       unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
1139       PrevMI->setDesc(TII->get(NewOpc));
1140       MO.setReg(ARM::PC);
1141       MBB.erase(MBBI);
1142       return true;
1143     }
1144   }
1145   return false;
1146 }
1147 
1148 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1149   const TargetMachine &TM = Fn.getTarget();
1150   AFI = Fn.getInfo<ARMFunctionInfo>();
1151   TII = TM.getInstrInfo();
1152   TRI = TM.getRegisterInfo();
1153   RS = new RegScavenger();
1154   isThumb2 = AFI->isThumb2Function();
1155 
1156   bool Modified = false;
1157   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1158        ++MFI) {
1159     MachineBasicBlock &MBB = *MFI;
1160     Modified |= LoadStoreMultipleOpti(MBB);
1161     Modified |= MergeReturnIntoLDM(MBB);
1162   }
1163 
1164   delete RS;
1165   return Modified;
1166 }
1167 
1168 
1169 /// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move
1170 /// load / stores from consecutive locations close to make it more
1171 /// likely they will be combined later.
1172 
namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    // Cached target / function state, initialized in runOnMachineFunction.
    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    // CanFormLdStDWord - Check whether Op0 / Op1 may be combined into one
    // LDRD / STRD; on success fills in the new opcode and operand values.
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    // RescheduleOps - Move the loads / stores in Ops (all off register
    // Base) next to each other when safe and profitable.
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}
1206 
1207 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1208   TD  = Fn.getTarget().getTargetData();
1209   TII = Fn.getTarget().getInstrInfo();
1210   TRI = Fn.getTarget().getRegisterInfo();
1211   STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
1212   MRI = &Fn.getRegInfo();
1213   MF  = &Fn;
1214 
1215   bool Modified = false;
1216   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1217        ++MFI)
1218     Modified |= RescheduleLoadStoreInstrs(MFI);
1219 
1220   return Modified;
1221 }
1222 
1223 static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1224                                       MachineBasicBlock::iterator I,
1225                                       MachineBasicBlock::iterator E,
1226                                       SmallPtrSet<MachineInstr*, 4> &MemOps,
1227                                       SmallSet<unsigned, 4> &MemRegs,
1228                                       const TargetRegisterInfo *TRI) {
1229   // Are there stores / loads / calls between them?
1230   // FIXME: This is overly conservative. We should make use of alias information
1231   // some day.
1232   SmallSet<unsigned, 4> AddedRegPressure;
1233   while (++I != E) {
1234     if (MemOps.count(&*I))
1235       continue;
1236     const TargetInstrDesc &TID = I->getDesc();
1237     if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
1238       return false;
1239     if (isLd && TID.mayStore())
1240       return false;
1241     if (!isLd) {
1242       if (TID.mayLoad())
1243         return false;
1244       // It's not safe to move the first 'str' down.
1245       // str r1, [r0]
1246       // strh r5, [r0]
1247       // str r4, [r0, #+4]
1248       if (TID.mayStore())
1249         return false;
1250     }
1251     for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1252       MachineOperand &MO = I->getOperand(j);
1253       if (!MO.isReg())
1254         continue;
1255       unsigned Reg = MO.getReg();
1256       if (MO.isDef() && TRI->regsOverlap(Reg, Base))
1257         return false;
1258       if (Reg != Base && !MemRegs.count(Reg))
1259         AddedRegPressure.insert(Reg);
1260     }
1261   }
1262 
1263   // Estimate register pressure increase due to the transformation.
1264   if (MemRegs.size() <= 4)
1265     // Ok if we are moving small number of instructions.
1266     return true;
1267   return AddedRegPressure.size() <= MemRegs.size() * 2;
1268 }
1269 
/// CanFormLdStDWord - Return true if Op0 / Op1 (two word-sized loads or
/// stores off the same base) can be combined into a single LDRD / STRD. On
/// success fills in the new opcode, the two transfer registers, base /
/// offset registers, encoded offset, predicate, and whether the result is
/// a Thumb2 instruction.
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
      return false;

  // Make sure the base address satisfies i64 ld / st alignment requirement.
  // We need a single, known, non-volatile memory operand to reason about
  // alignment.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 need 8-byte align
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      // Negative Thumb2 offsets must fit the 8-bit form's -255 floor.
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      // Positive offsets must fit the scaled range and be Scale-aligned.
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    // ARM addrmode3: encode as add / sub flag plus magnitude.
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  // LDRD / STRD need two distinct transfer registers.
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}
1355 
/// RescheduleOps - Move loads / stores of the same base register next to
/// each other (to the earliest op for loads, the latest for stores) when
/// it is safe and profitable, and try to form LDRD / STRD out of suitable
/// pairs. Returns true if anything was moved or combined.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Scan from the back (lowest offset first after the reverse sort),
    // collecting a run of same-opcode ops at consecutive offsets and
    // tracking the earliest / latest op in program order.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      // Stop at a gap: each op must start exactly where the previous one
      // ended.
      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      // Nothing mergeable at this offset; drop it and retry.
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        // Discard this run and try the remaining ops.
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          // No LDRD / STRD formed; just move the run of ops to the
          // insertion point.
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
1493 
/// RescheduleLoadStoreInstrs - Bucket the unpredicated loads and stores of
/// MBB by base register (within regions bounded by calls / terminators /
/// repeated base+offset pairs) and hand each bucket with more than one
/// entry to RescheduleOps.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;   // MI -> position in region.
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;   // Base registers with queued loads.
  SmallVector<unsigned, 4> StBases;   // Base registers with queued stores.

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    // Collect loads / stores until a barrier or a duplicate base+offset.
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      // Only reschedule unpredicated memory ops.
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          // A second op at an already-seen offset ends this region.
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          // First load seen for this base register.
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          // First store seen for this base register.
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    // Reset per-region state before scanning the next region.
    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}
1602 
1603 
1604 /// createARMLoadStoreOptimizationPass - returns an instance of the load / store
1605 /// optimization pass.
1606 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
1607   if (PreAlloc)
1608     return new ARMPreAllocLoadStoreOpt();
1609   return new ARMLoadStoreOpt();
1610 }
1611