1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
32 using namespace llvm;
33 
34 #define DEBUG_TYPE "aarch64-ldst-opt"
35 
36 /// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
37 /// load / store instructions to form ldp / stp instructions.
38 
39 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
40 STATISTIC(NumPostFolded, "Number of post-index updates folded");
41 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
42 STATISTIC(NumUnscaledPairCreated,
43           "Number of load/store from unscaled generated");
44 STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
45 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
46 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
47 
48 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
49                                    cl::init(20), cl::Hidden);
50 
51 namespace llvm {
52 void initializeAArch64LoadStoreOptPass(PassRegistry &);
53 }
54 
55 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
56 
57 namespace {
58 
59 typedef struct LdStPairFlags {
60   // If a matching instruction is found, MergeForward is set to true if the
61   // merge is to remove the first instruction and replace the second with
62   // a pair-wise insn, and false if the reverse is true.
63   bool MergeForward;
64 
65   // SExtIdx gives the index of the result of the load pair that must be
66   // extended. The value of SExtIdx assumes that the paired load produces the
67   // value in this order: (I, returned iterator), i.e., -1 means no value has
68   // to be extended, 0 means I, and 1 means the returned iterator.
69   int SExtIdx;
70 
71   LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
72 
73   void setMergeForward(bool V = true) { MergeForward = V; }
74   bool getMergeForward() const { return MergeForward; }
75 
76   void setSExtIdx(int V) { SExtIdx = V; }
77   int getSExtIdx() const { return SExtIdx; }
78 
79 } LdStPairFlags;
80 
81 struct AArch64LoadStoreOpt : public MachineFunctionPass {
82   static char ID;
83   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
84     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
85   }
86 
87   const AArch64InstrInfo *TII;
88   const TargetRegisterInfo *TRI;
89   const AArch64Subtarget *Subtarget;
90 
91   // Track which registers have been modified and used.
92   BitVector ModifiedRegs, UsedRegs;
93 
94   // Scan the instructions looking for a load/store that can be combined
95   // with the current instruction into a load/store pair.
96   // Return the matching instruction if one is found, else MBB->end().
97   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
98                                                LdStPairFlags &Flags,
99                                                unsigned Limit);
100 
101   // Scan the instructions looking for a store that writes to the address from
102   // which the current load instruction reads. Return true if one is found.
103   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
104                          MachineBasicBlock::iterator &StoreI);
105 
106   // Merge the two instructions indicated into a single pair-wise instruction.
107   // If MergeForward is true, erase the first instruction and fold its
108   // operation into the second. If false, the reverse. Return the instruction
109   // following the first instruction (which may change during processing).
110   MachineBasicBlock::iterator
111   mergePairedInsns(MachineBasicBlock::iterator I,
112                    MachineBasicBlock::iterator Paired,
113                    const LdStPairFlags &Flags);
114 
115   // Promote the load that reads directly from the address stored to.
116   MachineBasicBlock::iterator
117   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
118                        MachineBasicBlock::iterator StoreI);
119 
120   // Scan the instruction list to find a base register update that can
121   // be combined with the current instruction (a load or store) using
122   // pre or post indexed addressing with writeback. Scan forwards.
123   MachineBasicBlock::iterator
124   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
125                                 int UnscaledOffset);
126 
127   // Scan the instruction list to find a base register update that can
128   // be combined with the current instruction (a load or store) using
129   // pre or post indexed addressing with writeback. Scan backwards.
130   MachineBasicBlock::iterator
131   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I);
132 
133   // Find an instruction that updates the base register of the ld/st
134   // instruction.
135   bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
136                             unsigned BaseReg, int Offset);
137 
138   // Merge a pre- or post-index base register update into a ld/st instruction.
139   MachineBasicBlock::iterator
140   mergeUpdateInsn(MachineBasicBlock::iterator I,
141                   MachineBasicBlock::iterator Update, bool IsPreIdx);
142 
143   // Find and merge foldable ldr/str instructions.
144   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
145 
146   // Find and promote load instructions which read directly from store.
147   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
148 
149   // Check if converting two narrow loads into a single wider load with
150   // bitfield extracts could be enabled.
151   bool enableNarrowLdMerge(MachineFunction &Fn);
152 
153   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
154 
155   bool runOnMachineFunction(MachineFunction &Fn) override;
156 
157   const char *getPassName() const override {
158     return AARCH64_LOAD_STORE_OPT_NAME;
159   }
160 };
161 char AArch64LoadStoreOpt::ID = 0;
162 } // namespace
163 
164 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
165                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
166 
167 static bool isUnscaledLdSt(unsigned Opc) {
168   switch (Opc) {
169   default:
170     return false;
171   case AArch64::STURSi:
172   case AArch64::STURDi:
173   case AArch64::STURQi:
174   case AArch64::STURBBi:
175   case AArch64::STURHHi:
176   case AArch64::STURWi:
177   case AArch64::STURXi:
178   case AArch64::LDURSi:
179   case AArch64::LDURDi:
180   case AArch64::LDURQi:
181   case AArch64::LDURWi:
182   case AArch64::LDURXi:
183   case AArch64::LDURSWi:
184   case AArch64::LDURHHi:
185   case AArch64::LDURBBi:
186   case AArch64::LDURSBWi:
187   case AArch64::LDURSHWi:
188     return true;
189   }
190 }
191 
192 static bool isUnscaledLdSt(MachineInstr *MI) {
193   return isUnscaledLdSt(MI->getOpcode());
194 }
195 
196 static unsigned getBitExtrOpcode(MachineInstr *MI) {
197   switch (MI->getOpcode()) {
198   default:
199     llvm_unreachable("Unexpected opcode.");
200   case AArch64::LDRBBui:
201   case AArch64::LDURBBi:
202   case AArch64::LDRHHui:
203   case AArch64::LDURHHi:
204     return AArch64::UBFMWri;
205   case AArch64::LDRSBWui:
206   case AArch64::LDURSBWi:
207   case AArch64::LDRSHWui:
208   case AArch64::LDURSHWi:
209     return AArch64::SBFMWri;
210   }
211 }
212 
213 static bool isNarrowStore(unsigned Opc) {
214   switch (Opc) {
215   default:
216     return false;
217   case AArch64::STRBBui:
218   case AArch64::STURBBi:
219   case AArch64::STRHHui:
220   case AArch64::STURHHi:
221     return true;
222   }
223 }
224 
225 static bool isNarrowStore(MachineInstr *MI) {
226   return isNarrowStore(MI->getOpcode());
227 }
228 
229 static bool isNarrowLoad(unsigned Opc) {
230   switch (Opc) {
231   default:
232     return false;
233   case AArch64::LDRHHui:
234   case AArch64::LDURHHi:
235   case AArch64::LDRBBui:
236   case AArch64::LDURBBi:
237   case AArch64::LDRSHWui:
238   case AArch64::LDURSHWi:
239   case AArch64::LDRSBWui:
240   case AArch64::LDURSBWi:
241     return true;
242   }
243 }
244 
245 static bool isNarrowLoad(MachineInstr *MI) {
246   return isNarrowLoad(MI->getOpcode());
247 }
248 
249 // Scaling factor for unscaled load or store.
250 static int getMemScale(MachineInstr *MI) {
251   switch (MI->getOpcode()) {
252   default:
253     llvm_unreachable("Opcode has unknown scale!");
254   case AArch64::LDRBBui:
255   case AArch64::LDURBBi:
256   case AArch64::LDRSBWui:
257   case AArch64::LDURSBWi:
258   case AArch64::STRBBui:
259   case AArch64::STURBBi:
260     return 1;
261   case AArch64::LDRHHui:
262   case AArch64::LDURHHi:
263   case AArch64::LDRSHWui:
264   case AArch64::LDURSHWi:
265   case AArch64::STRHHui:
266   case AArch64::STURHHi:
267     return 2;
268   case AArch64::LDRSui:
269   case AArch64::LDURSi:
270   case AArch64::LDRSWui:
271   case AArch64::LDURSWi:
272   case AArch64::LDRWui:
273   case AArch64::LDURWi:
274   case AArch64::STRSui:
275   case AArch64::STURSi:
276   case AArch64::STRWui:
277   case AArch64::STURWi:
278   case AArch64::LDPSi:
279   case AArch64::LDPSWi:
280   case AArch64::LDPWi:
281   case AArch64::STPSi:
282   case AArch64::STPWi:
283     return 4;
284   case AArch64::LDRDui:
285   case AArch64::LDURDi:
286   case AArch64::LDRXui:
287   case AArch64::LDURXi:
288   case AArch64::STRDui:
289   case AArch64::STURDi:
290   case AArch64::STRXui:
291   case AArch64::STURXi:
292   case AArch64::LDPDi:
293   case AArch64::LDPXi:
294   case AArch64::STPDi:
295   case AArch64::STPXi:
296     return 8;
297   case AArch64::LDRQui:
298   case AArch64::LDURQi:
299   case AArch64::STRQui:
300   case AArch64::STURQi:
301   case AArch64::LDPQi:
302   case AArch64::STPQi:
303     return 16;
304   }
305 }
306 
307 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
308                                          bool *IsValidLdStrOpc = nullptr) {
309   if (IsValidLdStrOpc)
310     *IsValidLdStrOpc = true;
311   switch (Opc) {
312   default:
313     if (IsValidLdStrOpc)
314       *IsValidLdStrOpc = false;
315     return UINT_MAX;
316   case AArch64::STRDui:
317   case AArch64::STURDi:
318   case AArch64::STRQui:
319   case AArch64::STURQi:
320   case AArch64::STRBBui:
321   case AArch64::STURBBi:
322   case AArch64::STRHHui:
323   case AArch64::STURHHi:
324   case AArch64::STRWui:
325   case AArch64::STURWi:
326   case AArch64::STRXui:
327   case AArch64::STURXi:
328   case AArch64::LDRDui:
329   case AArch64::LDURDi:
330   case AArch64::LDRQui:
331   case AArch64::LDURQi:
332   case AArch64::LDRWui:
333   case AArch64::LDURWi:
334   case AArch64::LDRXui:
335   case AArch64::LDURXi:
336   case AArch64::STRSui:
337   case AArch64::STURSi:
338   case AArch64::LDRSui:
339   case AArch64::LDURSi:
340   case AArch64::LDRHHui:
341   case AArch64::LDURHHi:
342   case AArch64::LDRBBui:
343   case AArch64::LDURBBi:
344     return Opc;
345   case AArch64::LDRSWui:
346     return AArch64::LDRWui;
347   case AArch64::LDURSWi:
348     return AArch64::LDURWi;
349   case AArch64::LDRSBWui:
350     return AArch64::LDRBBui;
351   case AArch64::LDRSHWui:
352     return AArch64::LDRHHui;
353   case AArch64::LDURSBWi:
354     return AArch64::LDURBBi;
355   case AArch64::LDURSHWi:
356     return AArch64::LDURHHi;
357   }
358 }
359 
360 static unsigned getMatchingPairOpcode(unsigned Opc) {
361   switch (Opc) {
362   default:
363     llvm_unreachable("Opcode has no pairwise equivalent!");
364   case AArch64::STRSui:
365   case AArch64::STURSi:
366     return AArch64::STPSi;
367   case AArch64::STRDui:
368   case AArch64::STURDi:
369     return AArch64::STPDi;
370   case AArch64::STRQui:
371   case AArch64::STURQi:
372     return AArch64::STPQi;
373   case AArch64::STRBBui:
374     return AArch64::STRHHui;
375   case AArch64::STRHHui:
376     return AArch64::STRWui;
377   case AArch64::STURBBi:
378     return AArch64::STURHHi;
379   case AArch64::STURHHi:
380     return AArch64::STURWi;
381   case AArch64::STRWui:
382   case AArch64::STURWi:
383     return AArch64::STPWi;
384   case AArch64::STRXui:
385   case AArch64::STURXi:
386     return AArch64::STPXi;
387   case AArch64::LDRSui:
388   case AArch64::LDURSi:
389     return AArch64::LDPSi;
390   case AArch64::LDRDui:
391   case AArch64::LDURDi:
392     return AArch64::LDPDi;
393   case AArch64::LDRQui:
394   case AArch64::LDURQi:
395     return AArch64::LDPQi;
396   case AArch64::LDRWui:
397   case AArch64::LDURWi:
398     return AArch64::LDPWi;
399   case AArch64::LDRXui:
400   case AArch64::LDURXi:
401     return AArch64::LDPXi;
402   case AArch64::LDRSWui:
403   case AArch64::LDURSWi:
404     return AArch64::LDPSWi;
405   case AArch64::LDRHHui:
406   case AArch64::LDRSHWui:
407     return AArch64::LDRWui;
408   case AArch64::LDURHHi:
409   case AArch64::LDURSHWi:
410     return AArch64::LDURWi;
411   case AArch64::LDRBBui:
412   case AArch64::LDRSBWui:
413     return AArch64::LDRHHui;
414   case AArch64::LDURBBi:
415   case AArch64::LDURSBWi:
416     return AArch64::LDURHHi;
417   }
418 }
419 
420 static unsigned isMatchingStore(MachineInstr *LoadInst,
421                                 MachineInstr *StoreInst) {
422   unsigned LdOpc = LoadInst->getOpcode();
423   unsigned StOpc = StoreInst->getOpcode();
424   switch (LdOpc) {
425   default:
426     llvm_unreachable("Unsupported load instruction!");
427   case AArch64::LDRBBui:
428     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
429            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
430   case AArch64::LDURBBi:
431     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
432            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
433   case AArch64::LDRHHui:
434     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
435            StOpc == AArch64::STRXui;
436   case AArch64::LDURHHi:
437     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
438            StOpc == AArch64::STURXi;
439   case AArch64::LDRWui:
440     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
441   case AArch64::LDURWi:
442     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
443   case AArch64::LDRXui:
444     return StOpc == AArch64::STRXui;
445   case AArch64::LDURXi:
446     return StOpc == AArch64::STURXi;
447   }
448 }
449 
450 static unsigned getPreIndexedOpcode(unsigned Opc) {
451   switch (Opc) {
452   default:
453     llvm_unreachable("Opcode has no pre-indexed equivalent!");
454   case AArch64::STRSui:
455     return AArch64::STRSpre;
456   case AArch64::STRDui:
457     return AArch64::STRDpre;
458   case AArch64::STRQui:
459     return AArch64::STRQpre;
460   case AArch64::STRBBui:
461     return AArch64::STRBBpre;
462   case AArch64::STRHHui:
463     return AArch64::STRHHpre;
464   case AArch64::STRWui:
465     return AArch64::STRWpre;
466   case AArch64::STRXui:
467     return AArch64::STRXpre;
468   case AArch64::LDRSui:
469     return AArch64::LDRSpre;
470   case AArch64::LDRDui:
471     return AArch64::LDRDpre;
472   case AArch64::LDRQui:
473     return AArch64::LDRQpre;
474   case AArch64::LDRBBui:
475     return AArch64::LDRBBpre;
476   case AArch64::LDRHHui:
477     return AArch64::LDRHHpre;
478   case AArch64::LDRWui:
479     return AArch64::LDRWpre;
480   case AArch64::LDRXui:
481     return AArch64::LDRXpre;
482   case AArch64::LDRSWui:
483     return AArch64::LDRSWpre;
484   case AArch64::LDPSi:
485     return AArch64::LDPSpre;
486   case AArch64::LDPSWi:
487     return AArch64::LDPSWpre;
488   case AArch64::LDPDi:
489     return AArch64::LDPDpre;
490   case AArch64::LDPQi:
491     return AArch64::LDPQpre;
492   case AArch64::LDPWi:
493     return AArch64::LDPWpre;
494   case AArch64::LDPXi:
495     return AArch64::LDPXpre;
496   case AArch64::STPSi:
497     return AArch64::STPSpre;
498   case AArch64::STPDi:
499     return AArch64::STPDpre;
500   case AArch64::STPQi:
501     return AArch64::STPQpre;
502   case AArch64::STPWi:
503     return AArch64::STPWpre;
504   case AArch64::STPXi:
505     return AArch64::STPXpre;
506   }
507 }
508 
509 static unsigned getPostIndexedOpcode(unsigned Opc) {
510   switch (Opc) {
511   default:
512     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
513   case AArch64::STRSui:
514     return AArch64::STRSpost;
515   case AArch64::STRDui:
516     return AArch64::STRDpost;
517   case AArch64::STRQui:
518     return AArch64::STRQpost;
519   case AArch64::STRBBui:
520     return AArch64::STRBBpost;
521   case AArch64::STRHHui:
522     return AArch64::STRHHpost;
523   case AArch64::STRWui:
524     return AArch64::STRWpost;
525   case AArch64::STRXui:
526     return AArch64::STRXpost;
527   case AArch64::LDRSui:
528     return AArch64::LDRSpost;
529   case AArch64::LDRDui:
530     return AArch64::LDRDpost;
531   case AArch64::LDRQui:
532     return AArch64::LDRQpost;
533   case AArch64::LDRBBui:
534     return AArch64::LDRBBpost;
535   case AArch64::LDRHHui:
536     return AArch64::LDRHHpost;
537   case AArch64::LDRWui:
538     return AArch64::LDRWpost;
539   case AArch64::LDRXui:
540     return AArch64::LDRXpost;
541   case AArch64::LDRSWui:
542     return AArch64::LDRSWpost;
543   case AArch64::LDPSi:
544     return AArch64::LDPSpost;
545   case AArch64::LDPSWi:
546     return AArch64::LDPSWpost;
547   case AArch64::LDPDi:
548     return AArch64::LDPDpost;
549   case AArch64::LDPQi:
550     return AArch64::LDPQpost;
551   case AArch64::LDPWi:
552     return AArch64::LDPWpost;
553   case AArch64::LDPXi:
554     return AArch64::LDPXpost;
555   case AArch64::STPSi:
556     return AArch64::STPSpost;
557   case AArch64::STPDi:
558     return AArch64::STPDpost;
559   case AArch64::STPQi:
560     return AArch64::STPQpost;
561   case AArch64::STPWi:
562     return AArch64::STPWpost;
563   case AArch64::STPXi:
564     return AArch64::STPXpost;
565   }
566 }
567 
568 static bool isPairedLdSt(const MachineInstr *MI) {
569   switch (MI->getOpcode()) {
570   default:
571     return false;
572   case AArch64::LDPSi:
573   case AArch64::LDPSWi:
574   case AArch64::LDPDi:
575   case AArch64::LDPQi:
576   case AArch64::LDPWi:
577   case AArch64::LDPXi:
578   case AArch64::STPSi:
579   case AArch64::STPDi:
580   case AArch64::STPQi:
581   case AArch64::STPWi:
582   case AArch64::STPXi:
583     return true;
584   }
585 }
586 
587 static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
588                                           unsigned PairedRegOp = 0) {
589   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
590   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
591   return MI->getOperand(Idx);
592 }
593 
594 static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
595   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
596   return MI->getOperand(Idx);
597 }
598 
599 static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
600   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
601   return MI->getOperand(Idx);
602 }
603 
604 static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
605                                   MachineInstr *StoreInst) {
606   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
607   int LoadSize = getMemScale(LoadInst);
608   int StoreSize = getMemScale(StoreInst);
609   int UnscaledStOffset = isUnscaledLdSt(StoreInst)
610                              ? getLdStOffsetOp(StoreInst).getImm()
611                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
612   int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
613                              ? getLdStOffsetOp(LoadInst).getImm()
614                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
615   return (UnscaledStOffset <= UnscaledLdOffset) &&
616          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
617 }
618 
619 MachineBasicBlock::iterator
620 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
621                                       MachineBasicBlock::iterator Paired,
622                                       const LdStPairFlags &Flags) {
623   MachineBasicBlock::iterator NextI = I;
624   ++NextI;
625   // If NextI is the second of the two instructions to be merged, we need
626   // to skip one further. Either way we merge will invalidate the iterator,
627   // and we don't need to scan the new instruction, as it's a pairwise
628   // instruction, which we're not considering for further action anyway.
629   if (NextI == Paired)
630     ++NextI;
631 
632   int SExtIdx = Flags.getSExtIdx();
633   unsigned Opc =
634       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
635   bool IsUnscaled = isUnscaledLdSt(Opc);
636   int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
637 
638   bool MergeForward = Flags.getMergeForward();
639   unsigned NewOpc = getMatchingPairOpcode(Opc);
640   // Insert our new paired instruction after whichever of the paired
641   // instructions MergeForward indicates.
642   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
643   // Also based on MergeForward is from where we copy the base register operand
644   // so we get the flags compatible with the input code.
645   const MachineOperand &BaseRegOp =
646       MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
647 
648   // Which register is Rt and which is Rt2 depends on the offset order.
649   MachineInstr *RtMI, *Rt2MI;
650   if (getLdStOffsetOp(I).getImm() ==
651       getLdStOffsetOp(Paired).getImm() + OffsetStride) {
652     RtMI = Paired;
653     Rt2MI = I;
654     // Here we swapped the assumption made for SExtIdx.
655     // I.e., we turn ldp I, Paired into ldp Paired, I.
656     // Update the index accordingly.
657     if (SExtIdx != -1)
658       SExtIdx = (SExtIdx + 1) % 2;
659   } else {
660     RtMI = I;
661     Rt2MI = Paired;
662   }
663 
664   int OffsetImm = getLdStOffsetOp(RtMI).getImm();
665 
666   if (isNarrowLoad(Opc)) {
667     // Change the scaled offset from small to large type.
668     if (!IsUnscaled) {
669       assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
670       OffsetImm /= 2;
671     }
672     MachineInstr *RtNewDest = MergeForward ? I : Paired;
673     // When merging small (< 32 bit) loads for big-endian targets, the order of
674     // the component parts gets swapped.
675     if (!Subtarget->isLittleEndian())
676       std::swap(RtMI, Rt2MI);
677     // Construct the new load instruction.
678     MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
679     NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
680                        TII->get(NewOpc))
681                    .addOperand(getLdStRegOp(RtNewDest))
682                    .addOperand(BaseRegOp)
683                    .addImm(OffsetImm)
684                    .setMemRefs(I->mergeMemRefsWith(*Paired));
685 
686     DEBUG(
687         dbgs()
688         << "Creating the new load and extract. Replacing instructions:\n    ");
689     DEBUG(I->print(dbgs()));
690     DEBUG(dbgs() << "    ");
691     DEBUG(Paired->print(dbgs()));
692     DEBUG(dbgs() << "  with instructions:\n    ");
693     DEBUG((NewMemMI)->print(dbgs()));
694 
695     int Width = getMemScale(I) == 1 ? 8 : 16;
696     int LSBLow = 0;
697     int LSBHigh = Width;
698     int ImmsLow = LSBLow + Width - 1;
699     int ImmsHigh = LSBHigh + Width - 1;
700     MachineInstr *ExtDestMI = MergeForward ? Paired : I;
701     if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
702       // Create the bitfield extract for high bits.
703       BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
704                           TII->get(getBitExtrOpcode(Rt2MI)))
705                       .addOperand(getLdStRegOp(Rt2MI))
706                       .addReg(getLdStRegOp(RtNewDest).getReg())
707                       .addImm(LSBHigh)
708                       .addImm(ImmsHigh);
709       // Create the bitfield extract for low bits.
710       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
711         // For unsigned, prefer to use AND for low bits.
712         BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
713                             TII->get(AArch64::ANDWri))
714                         .addOperand(getLdStRegOp(RtMI))
715                         .addReg(getLdStRegOp(RtNewDest).getReg())
716                         .addImm(ImmsLow);
717       } else {
718         BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
719                             TII->get(getBitExtrOpcode(RtMI)))
720                         .addOperand(getLdStRegOp(RtMI))
721                         .addReg(getLdStRegOp(RtNewDest).getReg())
722                         .addImm(LSBLow)
723                         .addImm(ImmsLow);
724       }
725     } else {
726       // Create the bitfield extract for low bits.
727       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
728         // For unsigned, prefer to use AND for low bits.
729         BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
730                             TII->get(AArch64::ANDWri))
731                         .addOperand(getLdStRegOp(RtMI))
732                         .addReg(getLdStRegOp(RtNewDest).getReg())
733                         .addImm(ImmsLow);
734       } else {
735         BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
736                             TII->get(getBitExtrOpcode(RtMI)))
737                         .addOperand(getLdStRegOp(RtMI))
738                         .addReg(getLdStRegOp(RtNewDest).getReg())
739                         .addImm(LSBLow)
740                         .addImm(ImmsLow);
741       }
742 
743       // Create the bitfield extract for high bits.
744       BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
745                           TII->get(getBitExtrOpcode(Rt2MI)))
746                       .addOperand(getLdStRegOp(Rt2MI))
747                       .addReg(getLdStRegOp(RtNewDest).getReg())
748                       .addImm(LSBHigh)
749                       .addImm(ImmsHigh);
750     }
751     DEBUG(dbgs() << "    ");
752     DEBUG((BitExtMI1)->print(dbgs()));
753     DEBUG(dbgs() << "    ");
754     DEBUG((BitExtMI2)->print(dbgs()));
755     DEBUG(dbgs() << "\n");
756 
757     // Erase the old instructions.
758     I->eraseFromParent();
759     Paired->eraseFromParent();
760     return NextI;
761   }
762 
763   // Construct the new instruction.
764   MachineInstrBuilder MIB;
765   if (isNarrowStore(Opc)) {
766     // Change the scaled offset from small to large type.
767     if (!IsUnscaled) {
768       assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
769       OffsetImm /= 2;
770     }
771     MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
772                   TII->get(NewOpc))
773               .addOperand(getLdStRegOp(I))
774               .addOperand(BaseRegOp)
775               .addImm(OffsetImm)
776               .setMemRefs(I->mergeMemRefsWith(*Paired));
777   } else {
778     // Handle Unscaled
779     if (IsUnscaled)
780       OffsetImm /= OffsetStride;
781     MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
782                   TII->get(NewOpc))
783               .addOperand(getLdStRegOp(RtMI))
784               .addOperand(getLdStRegOp(Rt2MI))
785               .addOperand(BaseRegOp)
786               .addImm(OffsetImm);
787   }
788 
789   (void)MIB;
790 
791   // FIXME: Do we need/want to copy the mem operands from the source
792   //        instructions? Probably. What uses them after this?
793 
794   DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
795   DEBUG(I->print(dbgs()));
796   DEBUG(dbgs() << "    ");
797   DEBUG(Paired->print(dbgs()));
798   DEBUG(dbgs() << "  with instruction:\n    ");
799 
800   if (SExtIdx != -1) {
801     // Generate the sign extension for the proper result of the ldp.
802     // I.e., with X1, that would be:
803     // %W1<def> = KILL %W1, %X1<imp-def>
804     // %X1<def> = SBFMXri %X1<kill>, 0, 31
805     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
806     // Right now, DstMO has the extended register, since it comes from an
807     // extended opcode.
808     unsigned DstRegX = DstMO.getReg();
809     // Get the W variant of that register.
810     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
811     // Update the result of LDP to use the W instead of the X variant.
812     DstMO.setReg(DstRegW);
813     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
814     DEBUG(dbgs() << "\n");
815     // Make the machine verifier happy by providing a definition for
816     // the X register.
817     // Insert this definition right after the generated LDP, i.e., before
818     // InsertionPoint.
819     MachineInstrBuilder MIBKill =
820         BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
821                 TII->get(TargetOpcode::KILL), DstRegW)
822             .addReg(DstRegW)
823             .addReg(DstRegX, RegState::Define);
824     MIBKill->getOperand(2).setImplicit();
825     // Create the sign extension.
826     MachineInstrBuilder MIBSXTW =
827         BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
828                 TII->get(AArch64::SBFMXri), DstRegX)
829             .addReg(DstRegX)
830             .addImm(0)
831             .addImm(31);
832     (void)MIBSXTW;
833     DEBUG(dbgs() << "  Extend operand:\n    ");
834     DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
835     DEBUG(dbgs() << "\n");
836   } else {
837     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
838     DEBUG(dbgs() << "\n");
839   }
840 
841   // Erase the old instructions.
842   I->eraseFromParent();
843   Paired->eraseFromParent();
844 
845   return NextI;
846 }
847 
848 MachineBasicBlock::iterator
849 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
850                                           MachineBasicBlock::iterator StoreI) {
851   MachineBasicBlock::iterator NextI = LoadI;
852   ++NextI;
853 
854   int LoadSize = getMemScale(LoadI);
855   int StoreSize = getMemScale(StoreI);
856   unsigned LdRt = getLdStRegOp(LoadI).getReg();
857   unsigned StRt = getLdStRegOp(StoreI).getReg();
858   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
859 
860   assert((IsStoreXReg ||
861           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
862          "Unexpected RegClass");
863 
864   MachineInstr *BitExtMI;
865   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
866     // Remove the load, if the destination register of the loads is the same
867     // register for stored value.
868     if (StRt == LdRt && LoadSize == 8) {
869       DEBUG(dbgs() << "Remove load instruction:\n    ");
870       DEBUG(LoadI->print(dbgs()));
871       DEBUG(dbgs() << "\n");
872       LoadI->eraseFromParent();
873       return NextI;
874     }
875     // Replace the load with a mov if the load and store are in the same size.
876     BitExtMI =
877         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
878                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
879             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
880             .addReg(StRt)
881             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
882   } else {
883     // FIXME: Currently we disable this transformation in big-endian targets as
884     // performance and correctness are verified only in little-endian.
885     if (!Subtarget->isLittleEndian())
886       return NextI;
887     bool IsUnscaled = isUnscaledLdSt(LoadI);
888     assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
889     assert(LoadSize <= StoreSize && "Invalid load size");
890     int UnscaledLdOffset = IsUnscaled
891                                ? getLdStOffsetOp(LoadI).getImm()
892                                : getLdStOffsetOp(LoadI).getImm() * LoadSize;
893     int UnscaledStOffset = IsUnscaled
894                                ? getLdStOffsetOp(StoreI).getImm()
895                                : getLdStOffsetOp(StoreI).getImm() * StoreSize;
896     int Width = LoadSize * 8;
897     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
898     int Imms = Immr + Width - 1;
899     unsigned DestReg = IsStoreXReg
900                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
901                                                       &AArch64::GPR64RegClass)
902                            : LdRt;
903 
904     assert((UnscaledLdOffset >= UnscaledStOffset &&
905             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
906            "Invalid offset");
907 
908     Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
909     Imms = Immr + Width - 1;
910     if (UnscaledLdOffset == UnscaledStOffset) {
911       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
912                                 | ((Immr) << 6)               // immr
913                                 | ((Imms) << 0)               // imms
914           ;
915 
916       BitExtMI =
917           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
918                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
919                   DestReg)
920               .addReg(StRt)
921               .addImm(AndMaskEncoded);
922     } else {
923       BitExtMI =
924           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
925                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
926                   DestReg)
927               .addReg(StRt)
928               .addImm(Immr)
929               .addImm(Imms);
930     }
931   }
932 
933   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
934   DEBUG(StoreI->print(dbgs()));
935   DEBUG(dbgs() << "    ");
936   DEBUG(LoadI->print(dbgs()));
937   DEBUG(dbgs() << "  with instructions:\n    ");
938   DEBUG(StoreI->print(dbgs()));
939   DEBUG(dbgs() << "    ");
940   DEBUG((BitExtMI)->print(dbgs()));
941   DEBUG(dbgs() << "\n");
942 
943   // Erase the old instructions.
944   LoadI->eraseFromParent();
945   return NextI;
946 }
947 
948 /// trackRegDefsUses - Remember what registers the specified instruction uses
949 /// and modifies.
950 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
951                              BitVector &UsedRegs,
952                              const TargetRegisterInfo *TRI) {
953   for (const MachineOperand &MO : MI->operands()) {
954     if (MO.isRegMask())
955       ModifiedRegs.setBitsNotInMask(MO.getRegMask());
956 
957     if (!MO.isReg())
958       continue;
959     unsigned Reg = MO.getReg();
960     if (MO.isDef()) {
961       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
962         ModifiedRegs.set(*AI);
963     } else {
964       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
965       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
966         UsedRegs.set(*AI);
967     }
968   }
969 }
970 
971 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
972   // Convert the byte-offset used by unscaled into an "element" offset used
973   // by the scaled pair load/store instructions.
974   if (IsUnscaled)
975     Offset /= OffsetStride;
976 
977   return Offset <= 63 && Offset >= -64;
978 }
979 
980 // Do alignment, specialized to power of 2 and for signed ints,
981 // avoiding having to do a C-style cast from uint_64t to int when
982 // using alignTo from include/llvm/Support/MathExtras.h.
983 // FIXME: Move this function to include/MathExtras.h?
984 static int alignTo(int Num, int PowOf2) {
985   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
986 }
987 
988 static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
989                      const AArch64InstrInfo *TII) {
990   // One of the instructions must modify memory.
991   if (!MIa->mayStore() && !MIb->mayStore())
992     return false;
993 
994   // Both instructions must be memory operations.
995   if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
996     return false;
997 
998   return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
999 }
1000 
1001 static bool mayAlias(MachineInstr *MIa,
1002                      SmallVectorImpl<MachineInstr *> &MemInsns,
1003                      const AArch64InstrInfo *TII) {
1004   for (auto &MIb : MemInsns)
1005     if (mayAlias(MIa, MIb, TII))
1006       return true;
1007 
1008   return false;
1009 }
1010 
1011 bool AArch64LoadStoreOpt::findMatchingStore(
1012     MachineBasicBlock::iterator I, unsigned Limit,
1013     MachineBasicBlock::iterator &StoreI) {
1014   MachineBasicBlock::iterator E = I->getParent()->begin();
1015   MachineBasicBlock::iterator MBBI = I;
1016   MachineInstr *FirstMI = I;
1017   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1018 
1019   // Track which registers have been modified and used between the first insn
1020   // and the second insn.
1021   ModifiedRegs.reset();
1022   UsedRegs.reset();
1023 
1024   // FIXME: We miss the case where the matching store is the first instruction
1025   // in the basic block.
1026   for (unsigned Count = 0; MBBI != E && Count < Limit;) {
1027     --MBBI;
1028     MachineInstr *MI = MBBI;
1029     // Skip DBG_VALUE instructions. Otherwise debug info can affect the
1030     // optimization by changing how far we scan.
1031     if (MI->isDebugValue())
1032       continue;
1033     // Now that we know this is a real instruction, count it.
1034     ++Count;
1035 
1036     // If the load instruction reads directly from the address to which the
1037     // store instruction writes and the stored value is not modified, we can
1038     // promote the load. Since we do not handle stores with pre-/post-index,
1039     // it's unnecessary to check if BaseReg is modified by the store itself.
1040     if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
1041         BaseReg == getLdStBaseOp(MI).getReg() &&
1042         isLdOffsetInRangeOfSt(FirstMI, MI) &&
1043         !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
1044       StoreI = MBBI;
1045       return true;
1046     }
1047 
1048     if (MI->isCall())
1049       return false;
1050 
1051     // Update modified / uses register lists.
1052     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1053 
1054     // Otherwise, if the base register is modified, we have no match, so
1055     // return early.
1056     if (ModifiedRegs[BaseReg])
1057       return false;
1058 
1059     // If we encounter a store aliased with the load, return early.
1060     if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
1061       return false;
1062   }
1063   return false;
1064 }
1065 
1066 /// findMatchingInsn - Scan the instructions looking for a load/store that can
1067 /// be combined with the current instruction into a load/store pair.
1068 MachineBasicBlock::iterator
1069 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1070                                       LdStPairFlags &Flags, unsigned Limit) {
1071   MachineBasicBlock::iterator E = I->getParent()->end();
1072   MachineBasicBlock::iterator MBBI = I;
1073   MachineInstr *FirstMI = I;
1074   ++MBBI;
1075 
1076   unsigned Opc = FirstMI->getOpcode();
1077   bool MayLoad = FirstMI->mayLoad();
1078   bool IsUnscaled = isUnscaledLdSt(FirstMI);
1079   unsigned Reg = getLdStRegOp(FirstMI).getReg();
1080   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1081   int Offset = getLdStOffsetOp(FirstMI).getImm();
1082   bool IsNarrowStore = isNarrowStore(Opc);
1083 
1084   // For narrow stores, find only the case where the stored value is WZR.
1085   if (IsNarrowStore && Reg != AArch64::WZR)
1086     return E;
1087 
1088   // Early exit if the first instruction modifies the base register.
1089   // e.g., ldr x0, [x0]
1090   if (FirstMI->modifiesRegister(BaseReg, TRI))
1091     return E;
1092 
1093   // Early exit if the offset if not possible to match. (6 bits of positive
1094   // range, plus allow an extra one in case we find a later insn that matches
1095   // with Offset-1)
1096   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
1097   if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
1098       !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1099     return E;
1100 
1101   // Track which registers have been modified and used between the first insn
1102   // (inclusive) and the second insn.
1103   ModifiedRegs.reset();
1104   UsedRegs.reset();
1105 
1106   // Remember any instructions that read/write memory between FirstMI and MI.
1107   SmallVector<MachineInstr *, 4> MemInsns;
1108 
1109   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1110     MachineInstr *MI = MBBI;
1111     // Skip DBG_VALUE instructions. Otherwise debug info can affect the
1112     // optimization by changing how far we scan.
1113     if (MI->isDebugValue())
1114       continue;
1115 
1116     // Now that we know this is a real instruction, count it.
1117     ++Count;
1118 
1119     bool CanMergeOpc = Opc == MI->getOpcode();
1120     Flags.setSExtIdx(-1);
1121     if (!CanMergeOpc) {
1122       bool IsValidLdStrOpc;
1123       unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
1124       assert(IsValidLdStrOpc &&
1125              "Given Opc should be a Load or Store with an immediate");
1126       // Opc will be the first instruction in the pair.
1127       Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
1128       CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
1129     }
1130 
1131     if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
1132       assert(MI->mayLoadOrStore() && "Expected memory operation.");
1133       // If we've found another instruction with the same opcode, check to see
1134       // if the base and offset are compatible with our starting instruction.
1135       // These instructions all have scaled immediate operands, so we just
1136       // check for +1/-1. Make sure to check the new instruction offset is
1137       // actually an immediate and not a symbolic reference destined for
1138       // a relocation.
1139       //
1140       // Pairwise instructions have a 7-bit signed offset field. Single insns
1141       // have a 12-bit unsigned offset field. To be a valid combine, the
1142       // final offset must be in range.
1143       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
1144       int MIOffset = getLdStOffsetOp(MI).getImm();
1145       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1146                                    (Offset + OffsetStride == MIOffset))) {
1147         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1148         // If this is a volatile load/store that otherwise matched, stop looking
1149         // as something is going on that we don't have enough information to
1150         // safely transform. Similarly, stop if we see a hint to avoid pairs.
1151         if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1152           return E;
1153         // If the resultant immediate offset of merging these instructions
1154         // is out of range for a pairwise instruction, bail and keep looking.
1155         bool MIIsUnscaled = isUnscaledLdSt(MI);
1156         bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
1157         if (!IsNarrowLoad &&
1158             !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
1159           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1160           MemInsns.push_back(MI);
1161           continue;
1162         }
1163 
1164         if (IsNarrowLoad || IsNarrowStore) {
1165           // If the alignment requirements of the scaled wide load/store
1166           // instruction can't express the offset of the scaled narrow
1167           // input, bail and keep looking.
1168           if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
1169             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1170             MemInsns.push_back(MI);
1171             continue;
1172           }
1173         } else {
1174           // If the alignment requirements of the paired (scaled) instruction
1175           // can't express the offset of the unscaled input, bail and keep
1176           // looking.
1177           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1178             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1179             MemInsns.push_back(MI);
1180             continue;
1181           }
1182         }
1183         // If the destination register of the loads is the same register, bail
1184         // and keep looking. A load-pair instruction with both destination
1185         // registers the same is UNPREDICTABLE and will result in an exception.
1186         // For narrow stores, allow only when the stored value is the same
1187         // (i.e., WZR).
1188         if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
1189             (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
1190           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1191           MemInsns.push_back(MI);
1192           continue;
1193         }
1194 
1195         // If the Rt of the second instruction was not modified or used between
1196         // the two instructions and none of the instructions between the second
1197         // and first alias with the second, we can combine the second into the
1198         // first.
1199         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
1200             !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
1201             !mayAlias(MI, MemInsns, TII)) {
1202           Flags.setMergeForward(false);
1203           return MBBI;
1204         }
1205 
1206         // Likewise, if the Rt of the first instruction is not modified or used
1207         // between the two instructions and none of the instructions between the
1208         // first and the second alias with the first, we can combine the first
1209         // into the second.
1210         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
1211             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
1212             !mayAlias(FirstMI, MemInsns, TII)) {
1213           Flags.setMergeForward(true);
1214           return MBBI;
1215         }
1216         // Unable to combine these instructions due to interference in between.
1217         // Keep looking.
1218       }
1219     }
1220 
1221     // If the instruction wasn't a matching load or store.  Stop searching if we
1222     // encounter a call instruction that might modify memory.
1223     if (MI->isCall())
1224       return E;
1225 
1226     // Update modified / uses register lists.
1227     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1228 
1229     // Otherwise, if the base register is modified, we have no match, so
1230     // return early.
1231     if (ModifiedRegs[BaseReg])
1232       return E;
1233 
1234     // Update list of instructions that read/write memory.
1235     if (MI->mayLoadOrStore())
1236       MemInsns.push_back(MI);
1237   }
1238   return E;
1239 }
1240 
1241 MachineBasicBlock::iterator
1242 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1243                                      MachineBasicBlock::iterator Update,
1244                                      bool IsPreIdx) {
1245   assert((Update->getOpcode() == AArch64::ADDXri ||
1246           Update->getOpcode() == AArch64::SUBXri) &&
1247          "Unexpected base register update instruction to merge!");
1248   MachineBasicBlock::iterator NextI = I;
1249   // Return the instruction following the merged instruction, which is
1250   // the instruction following our unmerged load. Unless that's the add/sub
1251   // instruction we're merging, in which case it's the one after that.
1252   if (++NextI == Update)
1253     ++NextI;
1254 
1255   int Value = Update->getOperand(2).getImm();
1256   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1257          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1258   if (Update->getOpcode() == AArch64::SUBXri)
1259     Value = -Value;
1260 
1261   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1262                              : getPostIndexedOpcode(I->getOpcode());
1263   MachineInstrBuilder MIB;
1264   if (!isPairedLdSt(I)) {
1265     // Non-paired instruction.
1266     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1267               .addOperand(getLdStRegOp(Update))
1268               .addOperand(getLdStRegOp(I))
1269               .addOperand(getLdStBaseOp(I))
1270               .addImm(Value)
1271               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1272   } else {
1273     // Paired instruction.
1274     int Scale = getMemScale(I);
1275     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1276               .addOperand(getLdStRegOp(Update))
1277               .addOperand(getLdStRegOp(I, 0))
1278               .addOperand(getLdStRegOp(I, 1))
1279               .addOperand(getLdStBaseOp(I))
1280               .addImm(Value / Scale)
1281               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1282   }
1283   (void)MIB;
1284 
1285   if (IsPreIdx)
1286     DEBUG(dbgs() << "Creating pre-indexed load/store.");
1287   else
1288     DEBUG(dbgs() << "Creating post-indexed load/store.");
1289   DEBUG(dbgs() << "    Replacing instructions:\n    ");
1290   DEBUG(I->print(dbgs()));
1291   DEBUG(dbgs() << "    ");
1292   DEBUG(Update->print(dbgs()));
1293   DEBUG(dbgs() << "  with instruction:\n    ");
1294   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1295   DEBUG(dbgs() << "\n");
1296 
1297   // Erase the old instructions for the block.
1298   I->eraseFromParent();
1299   Update->eraseFromParent();
1300 
1301   return NextI;
1302 }
1303 
1304 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
1305                                                MachineInstr *MI,
1306                                                unsigned BaseReg, int Offset) {
1307   switch (MI->getOpcode()) {
1308   default:
1309     break;
1310   case AArch64::SUBXri:
1311     // Negate the offset for a SUB instruction.
1312     Offset *= -1;
1313   // FALLTHROUGH
1314   case AArch64::ADDXri:
1315     // Make sure it's a vanilla immediate operand, not a relocation or
1316     // anything else we can't handle.
1317     if (!MI->getOperand(2).isImm())
1318       break;
1319     // Watch out for 1 << 12 shifted value.
1320     if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
1321       break;
1322 
1323     // The update instruction source and destination register must be the
1324     // same as the load/store base register.
1325     if (MI->getOperand(0).getReg() != BaseReg ||
1326         MI->getOperand(1).getReg() != BaseReg)
1327       break;
1328 
1329     bool IsPairedInsn = isPairedLdSt(MemMI);
1330     int UpdateOffset = MI->getOperand(2).getImm();
1331     // For non-paired load/store instructions, the immediate must fit in a
1332     // signed 9-bit integer.
1333     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1334       break;
1335 
1336     // For paired load/store instructions, the immediate must be a multiple of
1337     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
1338     // integer.
1339     if (IsPairedInsn) {
1340       int Scale = getMemScale(MemMI);
1341       if (UpdateOffset % Scale != 0)
1342         break;
1343 
1344       int ScaledOffset = UpdateOffset / Scale;
1345       if (ScaledOffset > 64 || ScaledOffset < -64)
1346         break;
1347     }
1348 
1349     // If we have a non-zero Offset, we check that it matches the amount
1350     // we're adding to the register.
1351     if (!Offset || Offset == MI->getOperand(2).getImm())
1352       return true;
1353     break;
1354   }
1355   return false;
1356 }
1357 
1358 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1359     MachineBasicBlock::iterator I, int UnscaledOffset) {
1360   MachineBasicBlock::iterator E = I->getParent()->end();
1361   MachineInstr *MemMI = I;
1362   MachineBasicBlock::iterator MBBI = I;
1363 
1364   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1365   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1366 
1367   // Scan forward looking for post-index opportunities.  Updating instructions
1368   // can't be formed if the memory instruction doesn't have the offset we're
1369   // looking for.
1370   if (MIUnscaledOffset != UnscaledOffset)
1371     return E;
1372 
1373   // If the base register overlaps a destination register, we can't
1374   // merge the update.
1375   bool IsPairedInsn = isPairedLdSt(MemMI);
1376   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1377     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1378     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1379       return E;
1380   }
1381 
1382   // Track which registers have been modified and used between the first insn
1383   // (inclusive) and the second insn.
1384   ModifiedRegs.reset();
1385   UsedRegs.reset();
1386   ++MBBI;
1387   for (; MBBI != E; ++MBBI) {
1388     MachineInstr *MI = MBBI;
1389     // Skip DBG_VALUE instructions.
1390     if (MI->isDebugValue())
1391       continue;
1392 
1393     // If we found a match, return it.
1394     if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
1395       return MBBI;
1396 
1397     // Update the status of what the instruction clobbered and used.
1398     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1399 
1400     // Otherwise, if the base register is used or modified, we have no match, so
1401     // return early.
1402     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1403       return E;
1404   }
1405   return E;
1406 }
1407 
1408 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1409     MachineBasicBlock::iterator I) {
1410   MachineBasicBlock::iterator B = I->getParent()->begin();
1411   MachineBasicBlock::iterator E = I->getParent()->end();
1412   MachineInstr *MemMI = I;
1413   MachineBasicBlock::iterator MBBI = I;
1414 
1415   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1416   int Offset = getLdStOffsetOp(MemMI).getImm();
1417 
1418   // If the load/store is the first instruction in the block, there's obviously
1419   // not any matching update. Ditto if the memory offset isn't zero.
1420   if (MBBI == B || Offset != 0)
1421     return E;
1422   // If the base register overlaps a destination register, we can't
1423   // merge the update.
1424   bool IsPairedInsn = isPairedLdSt(MemMI);
1425   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1426     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1427     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1428       return E;
1429   }
1430 
1431   // Track which registers have been modified and used between the first insn
1432   // (inclusive) and the second insn.
1433   ModifiedRegs.reset();
1434   UsedRegs.reset();
1435   --MBBI;
1436   for (; MBBI != B; --MBBI) {
1437     MachineInstr *MI = MBBI;
1438     // Skip DBG_VALUE instructions.
1439     if (MI->isDebugValue())
1440       continue;
1441 
1442     // If we found a match, return it.
1443     if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
1444       return MBBI;
1445 
1446     // Update the status of what the instruction clobbered and used.
1447     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1448 
1449     // Otherwise, if the base register is used or modified, we have no match, so
1450     // return early.
1451     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1452       return E;
1453   }
1454   return E;
1455 }
1456 
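// Try to replace the load at MBBI with a register copy (or a bitfield extract,
// if the preceding store is wider than the load) taken from the stored value,
// so the value does not have to be reloaded from memory. On success, MBBI is
// updated to the next instruction to visit and true is returned.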
1457 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1458     MachineBasicBlock::iterator &MBBI) {
1459   MachineInstr *MI = MBBI;
  // If this is a volatile or otherwise ordered load, don't mess with it.
1461   if (MI->hasOrderedMemoryRef())
1462     return false;
1463 
1464   // Make sure this is a reg+imm.
1465   // FIXME: It is possible to extend it to handle reg+reg cases.
1466   if (!getLdStOffsetOp(MI).isImm())
1467     return false;
1468 
1469   // Look backward up to ScanLimit instructions.
1470   MachineBasicBlock::iterator StoreI;
1471   if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
1472     ++NumLoadsFromStoresPromoted;
1473     // Promote the load. Keeping the iterator straight is a
1474     // pain, so we let the merge routine tell us what the next instruction
1475     // is after it's done mucking about.
1476     MBBI = promoteLoadFromStore(MBBI, StoreI);
1477     return true;
1478   }
1479   return false;
1480 }
1481 
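// Try to combine the load/store at MBBI with a nearby compatible instruction:
// into a ldp/stp, or, for narrow loads and narrow zero stores, into a single
// wider instruction. On success, MBBI is updated to the next instruction to
// visit and true is returned.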
1482 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
1483     MachineBasicBlock::iterator &MBBI) {
1484   MachineInstr *MI = MBBI;
1485   MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile or otherwise ordered load/store, don't mess with it.
1487   if (MI->hasOrderedMemoryRef())
1488     return false;
1489 
1490   // Make sure this is a reg+imm (as opposed to an address reloc).
1491   if (!getLdStOffsetOp(MI).isImm())
1492     return false;
1493 
1494   // Check if this load/store has a hint to avoid pair formation.
1495   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1496   if (TII->isLdStPairSuppressed(MI))
1497     return false;
1498 
1499   // Look ahead up to ScanLimit instructions for a pairable instruction.
1500   LdStPairFlags Flags;
1501   MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
1502   if (Paired != E) {
1503     if (isNarrowLoad(MI)) {
1504       ++NumNarrowLoadsPromoted;
1505     } else if (isNarrowStore(MI)) {
1506       ++NumZeroStoresPromoted;
1507     } else {
1508       ++NumPairCreated;
1509       if (isUnscaledLdSt(MI))
1510         ++NumUnscaledPairCreated;
1511     }
1512 
    // Merge the instructions into a pair (or, for narrow loads and narrow
    // zero stores, into a single wider instruction). Keeping the iterator
    // straight is a pain, so we let the merge routine tell us what the next
    // instruction is after it's done mucking about.
1516     MBBI = mergePairedInsns(MBBI, Paired, Flags);
1517     return true;
1518   }
1519   return false;
1520 }
1521 
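// Run the load/store peephole transformations over a single basic block.
// Returns true if anything changed.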
1522 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1523                                         bool enableNarrowLdOpt) {
1524   bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
1535   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1536        MBBI != E;) {
1537     MachineInstr *MI = MBBI;
1538     switch (MI->getOpcode()) {
1539     default:
1540       // Just move on to the next instruction.
1541       ++MBBI;
1542       break;
1543     // Scaled instructions.
1544     case AArch64::LDRBBui:
1545     case AArch64::LDRHHui:
1546     case AArch64::LDRWui:
1547     case AArch64::LDRXui:
1548     // Unscaled instructions.
1549     case AArch64::LDURBBi:
1550     case AArch64::LDURHHi:
1551     case AArch64::LDURWi:
1552     case AArch64::LDURXi: {
1553       if (tryToPromoteLoadFromStore(MBBI)) {
1554         Modified = true;
1555         break;
1556       }
1557       ++MBBI;
1558       break;
1559     }
1560     }
1561   }
1562   // 2) Find narrow loads that can be converted into a single wider load
1563   //    with bitfield extract instructions.
1564   //      e.g.,
1565   //        ldrh w0, [x2]
1566   //        ldrh w1, [x2, #2]
1567   //        ; becomes
1568   //        ldr w0, [x2]
1569   //        ubfx w1, w0, #16, #16
  //        and w0, w0, #0xffff
1571   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1572        enableNarrowLdOpt && MBBI != E;) {
1573     MachineInstr *MI = MBBI;
1574     switch (MI->getOpcode()) {
1575     default:
1576       // Just move on to the next instruction.
1577       ++MBBI;
1578       break;
1579     // Scaled instructions.
1580     case AArch64::LDRBBui:
1581     case AArch64::LDRHHui:
1582     case AArch64::LDRSBWui:
1583     case AArch64::LDRSHWui:
1584     case AArch64::STRBBui:
1585     case AArch64::STRHHui:
1586     // Unscaled instructions.
1587     case AArch64::LDURBBi:
1588     case AArch64::LDURHHi:
1589     case AArch64::LDURSBWi:
1590     case AArch64::LDURSHWi:
1591     case AArch64::STURBBi:
1592     case AArch64::STURHHi: {
1593       if (tryToMergeLdStInst(MBBI)) {
1594         Modified = true;
1595         break;
1596       }
1597       ++MBBI;
1598       break;
1599     }
1600     }
1601   }
1602   // 3) Find loads and stores that can be merged into a single load or store
1603   //    pair instruction.
1604   //      e.g.,
1605   //        ldr x0, [x2]
1606   //        ldr x1, [x2, #8]
1607   //        ; becomes
1608   //        ldp x0, x1, [x2]
1609   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1610        MBBI != E;) {
1611     MachineInstr *MI = MBBI;
1612     switch (MI->getOpcode()) {
1613     default:
1614       // Just move on to the next instruction.
1615       ++MBBI;
1616       break;
1617     // Scaled instructions.
1618     case AArch64::STRSui:
1619     case AArch64::STRDui:
1620     case AArch64::STRQui:
1621     case AArch64::STRXui:
1622     case AArch64::STRWui:
1623     case AArch64::LDRSui:
1624     case AArch64::LDRDui:
1625     case AArch64::LDRQui:
1626     case AArch64::LDRXui:
1627     case AArch64::LDRWui:
1628     case AArch64::LDRSWui:
1629     // Unscaled instructions.
1630     case AArch64::STURSi:
1631     case AArch64::STURDi:
1632     case AArch64::STURQi:
1633     case AArch64::STURWi:
1634     case AArch64::STURXi:
1635     case AArch64::LDURSi:
1636     case AArch64::LDURDi:
1637     case AArch64::LDURQi:
1638     case AArch64::LDURWi:
1639     case AArch64::LDURXi:
1640     case AArch64::LDURSWi: {
1641       if (tryToMergeLdStInst(MBBI)) {
1642         Modified = true;
1643         break;
1644       }
1645       ++MBBI;
1646       break;
1647     }
1648     }
1649   }
1650   // 4) Find base register updates that can be merged into the load or store
1651   //    as a base-reg writeback.
1652   //      e.g.,
1653   //        ldr x0, [x2]
1654   //        add x2, x2, #4
1655   //        ; becomes
1656   //        ldr x0, [x2], #4
1657   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1658        MBBI != E;) {
1659     MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switches, though not strictly necessary.
1662     unsigned Opc = MI->getOpcode();
1663     switch (Opc) {
1664     default:
1665       // Just move on to the next instruction.
1666       ++MBBI;
1667       break;
1668     // Scaled instructions.
1669     case AArch64::STRSui:
1670     case AArch64::STRDui:
1671     case AArch64::STRQui:
1672     case AArch64::STRXui:
1673     case AArch64::STRWui:
1674     case AArch64::STRHHui:
1675     case AArch64::STRBBui:
1676     case AArch64::LDRSui:
1677     case AArch64::LDRDui:
1678     case AArch64::LDRQui:
1679     case AArch64::LDRXui:
1680     case AArch64::LDRWui:
1681     case AArch64::LDRHHui:
1682     case AArch64::LDRBBui:
1683     // Unscaled instructions.
1684     case AArch64::STURSi:
1685     case AArch64::STURDi:
1686     case AArch64::STURQi:
1687     case AArch64::STURWi:
1688     case AArch64::STURXi:
1689     case AArch64::LDURSi:
1690     case AArch64::LDURDi:
1691     case AArch64::LDURQi:
1692     case AArch64::LDURWi:
1693     case AArch64::LDURXi:
1694     // Paired instructions.
1695     case AArch64::LDPSi:
1696     case AArch64::LDPSWi:
1697     case AArch64::LDPDi:
1698     case AArch64::LDPQi:
1699     case AArch64::LDPWi:
1700     case AArch64::LDPXi:
1701     case AArch64::STPSi:
1702     case AArch64::STPDi:
1703     case AArch64::STPQi:
1704     case AArch64::STPWi:
1705     case AArch64::STPXi: {
1706       // Make sure this is a reg+imm (as opposed to an address reloc).
1707       if (!getLdStOffsetOp(MI).isImm()) {
1708         ++MBBI;
1709         break;
1710       }
1711       // Look forward to try to form a post-index instruction. For example,
1712       // ldr x0, [x20]
1713       // add x20, x20, #32
1714       //   merged into:
1715       // ldr x0, [x20], #32
1716       MachineBasicBlock::iterator Update =
1717           findMatchingUpdateInsnForward(MBBI, 0);
1718       if (Update != E) {
1719         // Merge the update into the ld/st.
1720         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1721         Modified = true;
1722         ++NumPostFolded;
1723         break;
1724       }
      // The pre-index transformations below aren't implemented for unscaled
      // instructions, so just move to the next instruction.
1727       if (isUnscaledLdSt(Opc)) {
1728         ++MBBI;
1729         break;
1730       }
1731 
1732       // Look back to try to find a pre-index instruction. For example,
1733       // add x0, x0, #8
1734       // ldr x1, [x0]
1735       //   merged into:
1736       // ldr x1, [x0, #8]!
1737       Update = findMatchingUpdateInsnBackward(MBBI);
1738       if (Update != E) {
1739         // Merge the update into the ld/st.
1740         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1741         Modified = true;
1742         ++NumPreFolded;
1743         break;
1744       }
1745       // The immediate in the load/store is scaled by the size of the memory
1746       // operation. The immediate in the add we're looking for,
1747       // however, is not, so adjust here.
1748       int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
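      // E.g., an LDRXui with an immediate of 8 addresses [BaseReg + 8 * 8], so
      // the update instruction we look for must adjust BaseReg by 64 bytes.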
1749 
      // Look forward to try to find an update instruction that can be merged
      // to form a pre-index instruction. For example,
1751       // ldr x1, [x0, #64]
1752       // add x0, x0, #64
1753       //   merged into:
1754       // ldr x1, [x0, #64]!
1755       Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset);
1756       if (Update != E) {
1757         // Merge the update into the ld/st.
1758         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1759         Modified = true;
1760         ++NumPreFolded;
1761         break;
1762       }
1763 
1764       // Nothing found. Just move to the next instruction.
1765       ++MBBI;
1766       break;
1767     }
1768     }
1769   }
1770 
1771   return Modified;
1772 }
1773 
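// Decide whether merging adjacent narrow loads into a single wider load (plus
// bitfield extracts) is likely to be profitable on the current subtarget.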
1774 bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
1775   bool ProfitableArch = Subtarget->isCortexA57();
  // FIXME: The benefit of converting narrow loads into a wider load is
  // microarchitecture-dependent, since it assumes that a single wide load with
  // two bitfield extracts is cheaper than two narrow loads. Currently, this
  // conversion is enabled only on Cortex-A57, where the performance benefit
  // was verified.
1780   return ProfitableArch && !Subtarget->requiresStrictAlign();
1781 }
1782 
1783 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1784   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1785   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1786   TRI = Subtarget->getRegisterInfo();
1787 
1788   // Resize the modified and used register bitfield trackers.  We do this once
1789   // per function and then clear the bitfield each time we optimize a load or
1790   // store.
1791   ModifiedRegs.resize(TRI->getNumRegs());
1792   UsedRegs.resize(TRI->getNumRegs());
1793 
1794   bool Modified = false;
1795   bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
1796   for (auto &MBB : Fn)
1797     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
1798 
1799   return Modified;
1800 }
1801 
1802 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
1803 // loads and stores near one another?
1804 
1805 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1806 /// load / store optimization pass.
1807 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1808   return new AArch64LoadStoreOpt();
1809 }
1810