1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
32 using namespace llvm;
33 
34 #define DEBUG_TYPE "aarch64-ldst-opt"
35 
/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.
38 
39 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
40 STATISTIC(NumPostFolded, "Number of post-index updates folded");
41 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled memory ops");
44 STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
45 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
46 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
47 
48 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
49                                    cl::init(20), cl::Hidden);
50 
51 namespace llvm {
52 void initializeAArch64LoadStoreOptPass(PassRegistry &);
53 }
54 
55 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
56 
57 namespace {
58 
struct LdStPairFlags {
60   // If a matching instruction is found, MergeForward is set to true if the
61   // merge is to remove the first instruction and replace the second with
62   // a pair-wise insn, and false if the reverse is true.
63   bool MergeForward;
64 
65   // SExtIdx gives the index of the result of the load pair that must be
66   // extended. The value of SExtIdx assumes that the paired load produces the
67   // value in this order: (I, returned iterator), i.e., -1 means no value has
68   // to be extended, 0 means I, and 1 means the returned iterator.
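  // For example, when I is an LDRSWui that gets paired with an LDRWui,
  // SExtIdx is 0 and mergePairedInsns emits an LDPWi followed by an SBFMXri
  // that sign-extends I's result.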
69   int SExtIdx;
70 
71   LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
72 
73   void setMergeForward(bool V = true) { MergeForward = V; }
74   bool getMergeForward() const { return MergeForward; }
75 
76   void setSExtIdx(int V) { SExtIdx = V; }
77   int getSExtIdx() const { return SExtIdx; }
};
80 
81 struct AArch64LoadStoreOpt : public MachineFunctionPass {
82   static char ID;
83   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
84     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
85   }
86 
87   const AArch64InstrInfo *TII;
88   const TargetRegisterInfo *TRI;
89   const AArch64Subtarget *Subtarget;
90 
91   // Scan the instructions looking for a load/store that can be combined
92   // with the current instruction into a load/store pair.
93   // Return the matching instruction if one is found, else MBB->end().
94   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
95                                                LdStPairFlags &Flags,
96                                                unsigned Limit);
97 
98   // Scan the instructions looking for a store that writes to the address from
99   // which the current load instruction reads. Return true if one is found.
100   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
101                          MachineBasicBlock::iterator &StoreI);
102 
103   // Merge the two instructions indicated into a single pair-wise instruction.
104   // If MergeForward is true, erase the first instruction and fold its
105   // operation into the second. If false, the reverse. Return the instruction
106   // following the first instruction (which may change during processing).
107   MachineBasicBlock::iterator
108   mergePairedInsns(MachineBasicBlock::iterator I,
109                    MachineBasicBlock::iterator Paired,
110                    const LdStPairFlags &Flags);
111 
112   // Promote the load that reads directly from the address stored to.
113   MachineBasicBlock::iterator
114   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
115                        MachineBasicBlock::iterator StoreI);
116 
  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan forwards.
120   MachineBasicBlock::iterator
121   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
122                                 int UnscaledOffset);
123 
  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan backwards.
127   MachineBasicBlock::iterator
128   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I);
129 
  // Check whether MI is an instruction that updates the base register of the
  // ld/st instruction MemMI and can be merged as a pre- or post-index update.
132   bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
133                             unsigned BaseReg, int Offset);
134 
135   // Merge a pre- or post-index base register update into a ld/st instruction.
136   MachineBasicBlock::iterator
137   mergeUpdateInsn(MachineBasicBlock::iterator I,
138                   MachineBasicBlock::iterator Update, bool IsPreIdx);
139 
140   // Find and merge foldable ldr/str instructions.
141   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
142 
  // Find and promote load instructions which read directly from a preceding
  // store.
144   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
145 
  // Decide whether converting two narrow loads into a single wider load with
  // bitfield extracts should be enabled for this function.
149 
150   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
151 
152   bool runOnMachineFunction(MachineFunction &Fn) override;
153 
154   const char *getPassName() const override {
155     return AARCH64_LOAD_STORE_OPT_NAME;
156   }
157 };
158 char AArch64LoadStoreOpt::ID = 0;
159 } // namespace
160 
161 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
162                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
163 
164 static bool isUnscaledLdSt(unsigned Opc) {
165   switch (Opc) {
166   default:
167     return false;
168   case AArch64::STURSi:
169   case AArch64::STURDi:
170   case AArch64::STURQi:
171   case AArch64::STURBBi:
172   case AArch64::STURHHi:
173   case AArch64::STURWi:
174   case AArch64::STURXi:
175   case AArch64::LDURSi:
176   case AArch64::LDURDi:
177   case AArch64::LDURQi:
178   case AArch64::LDURWi:
179   case AArch64::LDURXi:
180   case AArch64::LDURSWi:
181   case AArch64::LDURHHi:
182   case AArch64::LDURBBi:
183   case AArch64::LDURSBWi:
184   case AArch64::LDURSHWi:
185     return true;
186   }
187 }
188 
189 static bool isUnscaledLdSt(MachineInstr *MI) {
190   return isUnscaledLdSt(MI->getOpcode());
191 }
192 
193 static unsigned getBitExtrOpcode(MachineInstr *MI) {
194   switch (MI->getOpcode()) {
195   default:
196     llvm_unreachable("Unexpected opcode.");
197   case AArch64::LDRBBui:
198   case AArch64::LDURBBi:
199   case AArch64::LDRHHui:
200   case AArch64::LDURHHi:
201     return AArch64::UBFMWri;
202   case AArch64::LDRSBWui:
203   case AArch64::LDURSBWi:
204   case AArch64::LDRSHWui:
205   case AArch64::LDURSHWi:
206     return AArch64::SBFMWri;
207   }
208 }
209 
210 static bool isNarrowStore(unsigned Opc) {
211   switch (Opc) {
212   default:
213     return false;
214   case AArch64::STRBBui:
215   case AArch64::STURBBi:
216   case AArch64::STRHHui:
217   case AArch64::STURHHi:
218     return true;
219   }
220 }
221 
222 static bool isNarrowStore(MachineInstr *MI) {
223   return isNarrowStore(MI->getOpcode());
224 }
225 
226 static bool isNarrowLoad(unsigned Opc) {
227   switch (Opc) {
228   default:
229     return false;
230   case AArch64::LDRHHui:
231   case AArch64::LDURHHi:
232   case AArch64::LDRBBui:
233   case AArch64::LDURBBi:
234   case AArch64::LDRSHWui:
235   case AArch64::LDURSHWi:
236   case AArch64::LDRSBWui:
237   case AArch64::LDURSBWi:
238     return true;
239   }
240 }
241 
242 static bool isNarrowLoad(MachineInstr *MI) {
243   return isNarrowLoad(MI->getOpcode());
244 }
245 
// Return the size in bytes of the memory access performed by the load/store
// instruction; this is also the scale applied to scaled immediate offsets.
247 static int getMemScale(MachineInstr *MI) {
248   switch (MI->getOpcode()) {
249   default:
250     llvm_unreachable("Opcode has unknown scale!");
251   case AArch64::LDRBBui:
252   case AArch64::LDURBBi:
253   case AArch64::LDRSBWui:
254   case AArch64::LDURSBWi:
255   case AArch64::STRBBui:
256   case AArch64::STURBBi:
257     return 1;
258   case AArch64::LDRHHui:
259   case AArch64::LDURHHi:
260   case AArch64::LDRSHWui:
261   case AArch64::LDURSHWi:
262   case AArch64::STRHHui:
263   case AArch64::STURHHi:
264     return 2;
265   case AArch64::LDRSui:
266   case AArch64::LDURSi:
267   case AArch64::LDRSWui:
268   case AArch64::LDURSWi:
269   case AArch64::LDRWui:
270   case AArch64::LDURWi:
271   case AArch64::STRSui:
272   case AArch64::STURSi:
273   case AArch64::STRWui:
274   case AArch64::STURWi:
275   case AArch64::LDPSi:
276   case AArch64::LDPSWi:
277   case AArch64::LDPWi:
278   case AArch64::STPSi:
279   case AArch64::STPWi:
280     return 4;
281   case AArch64::LDRDui:
282   case AArch64::LDURDi:
283   case AArch64::LDRXui:
284   case AArch64::LDURXi:
285   case AArch64::STRDui:
286   case AArch64::STURDi:
287   case AArch64::STRXui:
288   case AArch64::STURXi:
289   case AArch64::LDPDi:
290   case AArch64::LDPXi:
291   case AArch64::STPDi:
292   case AArch64::STPXi:
293     return 8;
294   case AArch64::LDRQui:
295   case AArch64::LDURQi:
296   case AArch64::STRQui:
297   case AArch64::STURQi:
298   case AArch64::LDPQi:
299   case AArch64::STPQi:
300     return 16;
301   }
302 }
303 
304 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
305                                          bool *IsValidLdStrOpc = nullptr) {
306   if (IsValidLdStrOpc)
307     *IsValidLdStrOpc = true;
308   switch (Opc) {
309   default:
310     if (IsValidLdStrOpc)
311       *IsValidLdStrOpc = false;
312     return UINT_MAX;
313   case AArch64::STRDui:
314   case AArch64::STURDi:
315   case AArch64::STRQui:
316   case AArch64::STURQi:
317   case AArch64::STRBBui:
318   case AArch64::STURBBi:
319   case AArch64::STRHHui:
320   case AArch64::STURHHi:
321   case AArch64::STRWui:
322   case AArch64::STURWi:
323   case AArch64::STRXui:
324   case AArch64::STURXi:
325   case AArch64::LDRDui:
326   case AArch64::LDURDi:
327   case AArch64::LDRQui:
328   case AArch64::LDURQi:
329   case AArch64::LDRWui:
330   case AArch64::LDURWi:
331   case AArch64::LDRXui:
332   case AArch64::LDURXi:
333   case AArch64::STRSui:
334   case AArch64::STURSi:
335   case AArch64::LDRSui:
336   case AArch64::LDURSi:
337   case AArch64::LDRHHui:
338   case AArch64::LDURHHi:
339   case AArch64::LDRBBui:
340   case AArch64::LDURBBi:
341     return Opc;
342   case AArch64::LDRSWui:
343     return AArch64::LDRWui;
344   case AArch64::LDURSWi:
345     return AArch64::LDURWi;
346   case AArch64::LDRSBWui:
347     return AArch64::LDRBBui;
348   case AArch64::LDRSHWui:
349     return AArch64::LDRHHui;
350   case AArch64::LDURSBWi:
351     return AArch64::LDURBBi;
352   case AArch64::LDURSHWi:
353     return AArch64::LDURHHi;
354   }
355 }
356 
357 static unsigned getMatchingPairOpcode(unsigned Opc) {
358   switch (Opc) {
359   default:
360     llvm_unreachable("Opcode has no pairwise equivalent!");
361   case AArch64::STRSui:
362   case AArch64::STURSi:
363     return AArch64::STPSi;
364   case AArch64::STRDui:
365   case AArch64::STURDi:
366     return AArch64::STPDi;
367   case AArch64::STRQui:
368   case AArch64::STURQi:
369     return AArch64::STPQi;
370   case AArch64::STRBBui:
371     return AArch64::STRHHui;
372   case AArch64::STRHHui:
373     return AArch64::STRWui;
374   case AArch64::STURBBi:
375     return AArch64::STURHHi;
376   case AArch64::STURHHi:
377     return AArch64::STURWi;
378   case AArch64::STRWui:
379   case AArch64::STURWi:
380     return AArch64::STPWi;
381   case AArch64::STRXui:
382   case AArch64::STURXi:
383     return AArch64::STPXi;
384   case AArch64::LDRSui:
385   case AArch64::LDURSi:
386     return AArch64::LDPSi;
387   case AArch64::LDRDui:
388   case AArch64::LDURDi:
389     return AArch64::LDPDi;
390   case AArch64::LDRQui:
391   case AArch64::LDURQi:
392     return AArch64::LDPQi;
393   case AArch64::LDRWui:
394   case AArch64::LDURWi:
395     return AArch64::LDPWi;
396   case AArch64::LDRXui:
397   case AArch64::LDURXi:
398     return AArch64::LDPXi;
399   case AArch64::LDRSWui:
400   case AArch64::LDURSWi:
401     return AArch64::LDPSWi;
402   case AArch64::LDRHHui:
403   case AArch64::LDRSHWui:
404     return AArch64::LDRWui;
405   case AArch64::LDURHHi:
406   case AArch64::LDURSHWi:
407     return AArch64::LDURWi;
408   case AArch64::LDRBBui:
409   case AArch64::LDRSBWui:
410     return AArch64::LDRHHui;
411   case AArch64::LDURBBi:
412   case AArch64::LDURSBWi:
413     return AArch64::LDURHHi;
414   }
415 }
416 
static bool isMatchingStore(MachineInstr *LoadInst,
                            MachineInstr *StoreInst) {
419   unsigned LdOpc = LoadInst->getOpcode();
420   unsigned StOpc = StoreInst->getOpcode();
421   switch (LdOpc) {
422   default:
423     llvm_unreachable("Unsupported load instruction!");
424   case AArch64::LDRBBui:
425     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
426            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
427   case AArch64::LDURBBi:
428     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
429            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
430   case AArch64::LDRHHui:
431     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
432            StOpc == AArch64::STRXui;
433   case AArch64::LDURHHi:
434     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
435            StOpc == AArch64::STURXi;
436   case AArch64::LDRWui:
437     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
438   case AArch64::LDURWi:
439     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
440   case AArch64::LDRXui:
441     return StOpc == AArch64::STRXui;
442   case AArch64::LDURXi:
443     return StOpc == AArch64::STURXi;
444   }
445 }
446 
447 static unsigned getPreIndexedOpcode(unsigned Opc) {
448   switch (Opc) {
449   default:
450     llvm_unreachable("Opcode has no pre-indexed equivalent!");
451   case AArch64::STRSui:
452     return AArch64::STRSpre;
453   case AArch64::STRDui:
454     return AArch64::STRDpre;
455   case AArch64::STRQui:
456     return AArch64::STRQpre;
457   case AArch64::STRBBui:
458     return AArch64::STRBBpre;
459   case AArch64::STRHHui:
460     return AArch64::STRHHpre;
461   case AArch64::STRWui:
462     return AArch64::STRWpre;
463   case AArch64::STRXui:
464     return AArch64::STRXpre;
465   case AArch64::LDRSui:
466     return AArch64::LDRSpre;
467   case AArch64::LDRDui:
468     return AArch64::LDRDpre;
469   case AArch64::LDRQui:
470     return AArch64::LDRQpre;
471   case AArch64::LDRBBui:
472     return AArch64::LDRBBpre;
473   case AArch64::LDRHHui:
474     return AArch64::LDRHHpre;
475   case AArch64::LDRWui:
476     return AArch64::LDRWpre;
477   case AArch64::LDRXui:
478     return AArch64::LDRXpre;
479   case AArch64::LDRSWui:
480     return AArch64::LDRSWpre;
481   case AArch64::LDPSi:
482     return AArch64::LDPSpre;
483   case AArch64::LDPSWi:
484     return AArch64::LDPSWpre;
485   case AArch64::LDPDi:
486     return AArch64::LDPDpre;
487   case AArch64::LDPQi:
488     return AArch64::LDPQpre;
489   case AArch64::LDPWi:
490     return AArch64::LDPWpre;
491   case AArch64::LDPXi:
492     return AArch64::LDPXpre;
493   case AArch64::STPSi:
494     return AArch64::STPSpre;
495   case AArch64::STPDi:
496     return AArch64::STPDpre;
497   case AArch64::STPQi:
498     return AArch64::STPQpre;
499   case AArch64::STPWi:
500     return AArch64::STPWpre;
501   case AArch64::STPXi:
502     return AArch64::STPXpre;
503   }
504 }
505 
506 static unsigned getPostIndexedOpcode(unsigned Opc) {
507   switch (Opc) {
508   default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
510   case AArch64::STRSui:
511     return AArch64::STRSpost;
512   case AArch64::STRDui:
513     return AArch64::STRDpost;
514   case AArch64::STRQui:
515     return AArch64::STRQpost;
516   case AArch64::STRBBui:
517     return AArch64::STRBBpost;
518   case AArch64::STRHHui:
519     return AArch64::STRHHpost;
520   case AArch64::STRWui:
521     return AArch64::STRWpost;
522   case AArch64::STRXui:
523     return AArch64::STRXpost;
524   case AArch64::LDRSui:
525     return AArch64::LDRSpost;
526   case AArch64::LDRDui:
527     return AArch64::LDRDpost;
528   case AArch64::LDRQui:
529     return AArch64::LDRQpost;
530   case AArch64::LDRBBui:
531     return AArch64::LDRBBpost;
532   case AArch64::LDRHHui:
533     return AArch64::LDRHHpost;
534   case AArch64::LDRWui:
535     return AArch64::LDRWpost;
536   case AArch64::LDRXui:
537     return AArch64::LDRXpost;
538   case AArch64::LDRSWui:
539     return AArch64::LDRSWpost;
540   case AArch64::LDPSi:
541     return AArch64::LDPSpost;
542   case AArch64::LDPSWi:
543     return AArch64::LDPSWpost;
544   case AArch64::LDPDi:
545     return AArch64::LDPDpost;
546   case AArch64::LDPQi:
547     return AArch64::LDPQpost;
548   case AArch64::LDPWi:
549     return AArch64::LDPWpost;
550   case AArch64::LDPXi:
551     return AArch64::LDPXpost;
552   case AArch64::STPSi:
553     return AArch64::STPSpost;
554   case AArch64::STPDi:
555     return AArch64::STPDpost;
556   case AArch64::STPQi:
557     return AArch64::STPQpost;
558   case AArch64::STPWi:
559     return AArch64::STPWpost;
560   case AArch64::STPXi:
561     return AArch64::STPXpost;
562   }
563 }
564 
565 static bool isPairedLdSt(const MachineInstr *MI) {
566   switch (MI->getOpcode()) {
567   default:
568     return false;
569   case AArch64::LDPSi:
570   case AArch64::LDPSWi:
571   case AArch64::LDPDi:
572   case AArch64::LDPQi:
573   case AArch64::LDPWi:
574   case AArch64::LDPXi:
575   case AArch64::STPSi:
576   case AArch64::STPDi:
577   case AArch64::STPQi:
578   case AArch64::STPWi:
579   case AArch64::STPXi:
580     return true;
581   }
582 }
583 
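// In the operand layout used below, a non-paired load/store is
// (Rt, base, offset) and a paired one is (Rt, Rt2, base, offset), which is
// why the operand index is shifted by one for paired instructions.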
584 static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
585                                           unsigned PairedRegOp = 0) {
586   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
587   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
588   return MI->getOperand(Idx);
589 }
590 
591 static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
592   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
593   return MI->getOperand(Idx);
594 }
595 
596 static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
597   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
598   return MI->getOperand(Idx);
599 }
600 
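// Return true when the bytes read by LoadInst lie entirely within the bytes
// written by StoreInst, e.g. "str w1, [x0, #4]" followed by
// "ldrh w2, [x0, #6]": the load reads bytes 6-7 of the word stored at 4-7.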
601 static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
602                                   MachineInstr *StoreInst) {
603   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
604   int LoadSize = getMemScale(LoadInst);
605   int StoreSize = getMemScale(StoreInst);
606   int UnscaledStOffset = isUnscaledLdSt(StoreInst)
607                              ? getLdStOffsetOp(StoreInst).getImm()
608                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
609   int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
610                              ? getLdStOffsetOp(LoadInst).getImm()
611                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
612   return (UnscaledStOffset <= UnscaledLdOffset) &&
613          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
614 }
615 
616 MachineBasicBlock::iterator
617 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
618                                       MachineBasicBlock::iterator Paired,
619                                       const LdStPairFlags &Flags) {
620   MachineBasicBlock::iterator NextI = I;
621   ++NextI;
622   // If NextI is the second of the two instructions to be merged, we need
623   // to skip one further. Either way we merge will invalidate the iterator,
624   // and we don't need to scan the new instruction, as it's a pairwise
625   // instruction, which we're not considering for further action anyway.
626   if (NextI == Paired)
627     ++NextI;
628 
629   int SExtIdx = Flags.getSExtIdx();
630   unsigned Opc =
631       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
632   bool IsUnscaled = isUnscaledLdSt(Opc);
633   int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
634 
635   bool MergeForward = Flags.getMergeForward();
636   unsigned NewOpc = getMatchingPairOpcode(Opc);
637   // Insert our new paired instruction after whichever of the paired
638   // instructions MergeForward indicates.
639   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags remain compatible with the input code.
642   const MachineOperand &BaseRegOp =
643       MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
644 
645   // Which register is Rt and which is Rt2 depends on the offset order.
646   MachineInstr *RtMI, *Rt2MI;
647   if (getLdStOffsetOp(I).getImm() ==
648       getLdStOffsetOp(Paired).getImm() + OffsetStride) {
649     RtMI = Paired;
650     Rt2MI = I;
651     // Here we swapped the assumption made for SExtIdx.
652     // I.e., we turn ldp I, Paired into ldp Paired, I.
653     // Update the index accordingly.
654     if (SExtIdx != -1)
655       SExtIdx = (SExtIdx + 1) % 2;
656   } else {
657     RtMI = I;
658     Rt2MI = Paired;
659   }
660 
661   int OffsetImm = getLdStOffsetOp(RtMI).getImm();
662 
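  // Narrow loads are merged into a single wider load plus bitfield extracts.
  // For example, with MergeForward == false on a little-endian target,
  //   ldrh w0, [x2]
  //   ldrh w1, [x2, #2]
  // becomes
  //   ldr  w1, [x2]
  //   and  w0, w1, #0xffff
  //   ubfm w1, w1, #16, #31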
663   if (isNarrowLoad(Opc)) {
    // Rescale the offset from the narrow type to the wide type.
665     if (!IsUnscaled) {
666       assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
667       OffsetImm /= 2;
668     }
669     MachineInstr *RtNewDest = MergeForward ? I : Paired;
670     // When merging small (< 32 bit) loads for big-endian targets, the order of
671     // the component parts gets swapped.
672     if (!Subtarget->isLittleEndian())
673       std::swap(RtMI, Rt2MI);
674     // Construct the new load instruction.
675     MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
676     NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
677                        TII->get(NewOpc))
678                    .addOperand(getLdStRegOp(RtNewDest))
679                    .addOperand(BaseRegOp)
680                    .addImm(OffsetImm)
681                    .setMemRefs(I->mergeMemRefsWith(*Paired));
682 
683     DEBUG(
684         dbgs()
685         << "Creating the new load and extract. Replacing instructions:\n    ");
686     DEBUG(I->print(dbgs()));
687     DEBUG(dbgs() << "    ");
688     DEBUG(Paired->print(dbgs()));
689     DEBUG(dbgs() << "  with instructions:\n    ");
690     DEBUG((NewMemMI)->print(dbgs()));
691 
692     int Width = getMemScale(I) == 1 ? 8 : 16;
693     int LSBLow = 0;
694     int LSBHigh = Width;
695     int ImmsLow = LSBLow + Width - 1;
696     int ImmsHigh = LSBHigh + Width - 1;
697     MachineInstr *ExtDestMI = MergeForward ? Paired : I;
698     if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
699       // Create the bitfield extract for high bits.
700       BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
701                           TII->get(getBitExtrOpcode(Rt2MI)))
702                       .addOperand(getLdStRegOp(Rt2MI))
703                       .addReg(getLdStRegOp(RtNewDest).getReg())
704                       .addImm(LSBHigh)
705                       .addImm(ImmsHigh);
706       // Create the bitfield extract for low bits.
707       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
708         // For unsigned, prefer to use AND for low bits.
709         BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
710                             TII->get(AArch64::ANDWri))
711                         .addOperand(getLdStRegOp(RtMI))
712                         .addReg(getLdStRegOp(RtNewDest).getReg())
713                         .addImm(ImmsLow);
714       } else {
715         BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
716                             TII->get(getBitExtrOpcode(RtMI)))
717                         .addOperand(getLdStRegOp(RtMI))
718                         .addReg(getLdStRegOp(RtNewDest).getReg())
719                         .addImm(LSBLow)
720                         .addImm(ImmsLow);
721       }
722     } else {
723       // Create the bitfield extract for low bits.
724       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
725         // For unsigned, prefer to use AND for low bits.
726         BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
727                             TII->get(AArch64::ANDWri))
728                         .addOperand(getLdStRegOp(RtMI))
729                         .addReg(getLdStRegOp(RtNewDest).getReg())
730                         .addImm(ImmsLow);
731       } else {
732         BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
733                             TII->get(getBitExtrOpcode(RtMI)))
734                         .addOperand(getLdStRegOp(RtMI))
735                         .addReg(getLdStRegOp(RtNewDest).getReg())
736                         .addImm(LSBLow)
737                         .addImm(ImmsLow);
738       }
739 
740       // Create the bitfield extract for high bits.
741       BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
742                           TII->get(getBitExtrOpcode(Rt2MI)))
743                       .addOperand(getLdStRegOp(Rt2MI))
744                       .addReg(getLdStRegOp(RtNewDest).getReg())
745                       .addImm(LSBHigh)
746                       .addImm(ImmsHigh);
747     }
748     DEBUG(dbgs() << "    ");
749     DEBUG((BitExtMI1)->print(dbgs()));
750     DEBUG(dbgs() << "    ");
751     DEBUG((BitExtMI2)->print(dbgs()));
752     DEBUG(dbgs() << "\n");
753 
754     // Erase the old instructions.
755     I->eraseFromParent();
756     Paired->eraseFromParent();
757     return NextI;
758   }
759 
760   // Construct the new instruction.
761   MachineInstrBuilder MIB;
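  // Adjacent narrow stores of WZR are merged into a single wider zero store,
  // e.g.
  //   strh wzr, [x0]
  //   strh wzr, [x0, #2]
  // becomes "str wzr, [x0]".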
762   if (isNarrowStore(Opc)) {
    // Rescale the offset from the narrow type to the wide type.
764     if (!IsUnscaled) {
765       assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
766       OffsetImm /= 2;
767     }
768     MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
769                   TII->get(NewOpc))
770               .addOperand(getLdStRegOp(I))
771               .addOperand(BaseRegOp)
772               .addImm(OffsetImm)
773               .setMemRefs(I->mergeMemRefsWith(*Paired));
774   } else {
    // Handle unscaled instructions: convert the byte offset into the element
    // offset used by the paired instruction.
776     if (IsUnscaled)
777       OffsetImm /= OffsetStride;
778     MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
779                   TII->get(NewOpc))
780               .addOperand(getLdStRegOp(RtMI))
781               .addOperand(getLdStRegOp(Rt2MI))
782               .addOperand(BaseRegOp)
783               .addImm(OffsetImm);
784   }
785 
786   (void)MIB;
787 
788   // FIXME: Do we need/want to copy the mem operands from the source
789   //        instructions? Probably. What uses them after this?
790 
791   DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
792   DEBUG(I->print(dbgs()));
793   DEBUG(dbgs() << "    ");
794   DEBUG(Paired->print(dbgs()));
795   DEBUG(dbgs() << "  with instruction:\n    ");
796 
797   if (SExtIdx != -1) {
798     // Generate the sign extension for the proper result of the ldp.
799     // I.e., with X1, that would be:
800     // %W1<def> = KILL %W1, %X1<imp-def>
801     // %X1<def> = SBFMXri %X1<kill>, 0, 31
802     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
803     // Right now, DstMO has the extended register, since it comes from an
804     // extended opcode.
805     unsigned DstRegX = DstMO.getReg();
806     // Get the W variant of that register.
807     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
808     // Update the result of LDP to use the W instead of the X variant.
809     DstMO.setReg(DstRegW);
810     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
811     DEBUG(dbgs() << "\n");
812     // Make the machine verifier happy by providing a definition for
813     // the X register.
814     // Insert this definition right after the generated LDP, i.e., before
815     // InsertionPoint.
816     MachineInstrBuilder MIBKill =
817         BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
818                 TII->get(TargetOpcode::KILL), DstRegW)
819             .addReg(DstRegW)
820             .addReg(DstRegX, RegState::Define);
821     MIBKill->getOperand(2).setImplicit();
822     // Create the sign extension.
823     MachineInstrBuilder MIBSXTW =
824         BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
825                 TII->get(AArch64::SBFMXri), DstRegX)
826             .addReg(DstRegX)
827             .addImm(0)
828             .addImm(31);
829     (void)MIBSXTW;
830     DEBUG(dbgs() << "  Extend operand:\n    ");
831     DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
832     DEBUG(dbgs() << "\n");
833   } else {
834     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
835     DEBUG(dbgs() << "\n");
836   }
837 
838   // Erase the old instructions.
839   I->eraseFromParent();
840   Paired->eraseFromParent();
841 
842   return NextI;
843 }
844 
845 MachineBasicBlock::iterator
846 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
847                                           MachineBasicBlock::iterator StoreI) {
848   MachineBasicBlock::iterator NextI = LoadI;
849   ++NextI;
850 
851   int LoadSize = getMemScale(LoadI);
852   int StoreSize = getMemScale(StoreI);
853   unsigned LdRt = getLdStRegOp(LoadI).getReg();
854   unsigned StRt = getLdStRegOp(StoreI).getReg();
855   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
856 
857   assert((IsStoreXReg ||
858           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
859          "Unexpected RegClass");
860 
861   MachineInstr *BitExtMI;
862   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if its destination register is the same as the register
    // holding the stored value.
865     if (StRt == LdRt && LoadSize == 8) {
866       DEBUG(dbgs() << "Remove load instruction:\n    ");
867       DEBUG(LoadI->print(dbgs()));
868       DEBUG(dbgs() << "\n");
869       LoadI->eraseFromParent();
870       return NextI;
871     }
    // Replace the load with a mov if the load and store are the same size.
873     BitExtMI =
874         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
875                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
876             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
877             .addReg(StRt)
878             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
879   } else {
880     // FIXME: Currently we disable this transformation in big-endian targets as
881     // performance and correctness are verified only in little-endian.
882     if (!Subtarget->isLittleEndian())
883       return NextI;
884     bool IsUnscaled = isUnscaledLdSt(LoadI);
885     assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
886     assert(LoadSize <= StoreSize && "Invalid load size");
887     int UnscaledLdOffset = IsUnscaled
888                                ? getLdStOffsetOp(LoadI).getImm()
889                                : getLdStOffsetOp(LoadI).getImm() * LoadSize;
890     int UnscaledStOffset = IsUnscaled
891                                ? getLdStOffsetOp(StoreI).getImm()
892                                : getLdStOffsetOp(StoreI).getImm() * StoreSize;
893     int Width = LoadSize * 8;
894     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
895     int Imms = Immr + Width - 1;
896     unsigned DestReg = IsStoreXReg
897                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
898                                                       &AArch64::GPR64RegClass)
899                            : LdRt;
900 
901     assert((UnscaledLdOffset >= UnscaledStOffset &&
902             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
903            "Invalid offset");
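    // For example, promoting "ldrh w2, [x0, #6]" over "str w1, [x0, #4]"
    // gives UnscaledLdOffset - UnscaledStOffset == 2, so Immr == 16 and
    // Imms == 31, i.e. a UBFM equivalent to "lsr w2, w1, #16".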
904 
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) | // N
                                (Immr << 6) |                   // immr
                                (Imms << 0);                    // imms
912 
913       BitExtMI =
914           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
915                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
916                   DestReg)
917               .addReg(StRt)
918               .addImm(AndMaskEncoded);
919     } else {
920       BitExtMI =
921           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
922                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
923                   DestReg)
924               .addReg(StRt)
925               .addImm(Immr)
926               .addImm(Imms);
927     }
928   }
929 
  DEBUG(dbgs() << "Promoting load by replacing load:\n    ");
931   DEBUG(StoreI->print(dbgs()));
932   DEBUG(dbgs() << "    ");
933   DEBUG(LoadI->print(dbgs()));
934   DEBUG(dbgs() << "  with instructions:\n    ");
935   DEBUG(StoreI->print(dbgs()));
936   DEBUG(dbgs() << "    ");
937   DEBUG((BitExtMI)->print(dbgs()));
938   DEBUG(dbgs() << "\n");
939 
940   // Erase the old instructions.
941   LoadI->eraseFromParent();
942   return NextI;
943 }
944 
945 /// trackRegDefsUses - Remember what registers the specified instruction uses
946 /// and modifies.
947 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
948                              BitVector &UsedRegs,
949                              const TargetRegisterInfo *TRI) {
950   for (const MachineOperand &MO : MI->operands()) {
951     if (MO.isRegMask())
952       ModifiedRegs.setBitsNotInMask(MO.getRegMask());
953 
954     if (!MO.isReg())
955       continue;
956     unsigned Reg = MO.getReg();
957     if (MO.isDef()) {
958       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
959         ModifiedRegs.set(*AI);
960     } else {
961       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
962       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
963         UsedRegs.set(*AI);
964     }
965   }
966 }
967 
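// Paired instructions have a 7-bit signed scaled immediate, so the element
// offset of the resulting pair must lie in [-64, 63]. For example, an 8-byte
// load at byte offset #504 followed by one at #512 can still be paired
// (element offset 63), but loads at #512 and #520 cannot.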
968 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte offset used by unscaled instructions into the "element"
  // offset used by the scaled pair load/store instructions.
971   if (IsUnscaled)
972     Offset /= OffsetStride;
973 
974   return Offset <= 63 && Offset >= -64;
975 }
976 
977 // Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
979 // using alignTo from include/llvm/Support/MathExtras.h.
980 // FIXME: Move this function to include/MathExtras.h?
981 static int alignTo(int Num, int PowOf2) {
982   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
983 }
984 
985 static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
986                      const AArch64InstrInfo *TII) {
987   // One of the instructions must modify memory.
988   if (!MIa->mayStore() && !MIb->mayStore())
989     return false;
990 
991   // Both instructions must be memory operations.
992   if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
993     return false;
994 
995   return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
996 }
997 
998 static bool mayAlias(MachineInstr *MIa,
999                      SmallVectorImpl<MachineInstr *> &MemInsns,
1000                      const AArch64InstrInfo *TII) {
1001   for (auto &MIb : MemInsns)
1002     if (mayAlias(MIa, MIb, TII))
1003       return true;
1004 
1005   return false;
1006 }
1007 
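// Look backwards from the load for a store that writes all of the bytes the
// load reads from the same base register, so that the loaded value can be
// taken from the stored register instead of being reloaded from memory.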
1008 bool AArch64LoadStoreOpt::findMatchingStore(
1009     MachineBasicBlock::iterator I, unsigned Limit,
1010     MachineBasicBlock::iterator &StoreI) {
1011   MachineBasicBlock::iterator E = I->getParent()->begin();
1012   MachineBasicBlock::iterator MBBI = I;
1013   MachineInstr *FirstMI = I;
1014   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1015 
1016   // Track which registers have been modified and used between the first insn
1017   // and the second insn.
1018   BitVector ModifiedRegs, UsedRegs;
1019   ModifiedRegs.resize(TRI->getNumRegs());
1020   UsedRegs.resize(TRI->getNumRegs());
1021 
1022   for (unsigned Count = 0; MBBI != E && Count < Limit;) {
1023     --MBBI;
1024     MachineInstr *MI = MBBI;
1025     // Skip DBG_VALUE instructions. Otherwise debug info can affect the
1026     // optimization by changing how far we scan.
1027     if (MI->isDebugValue())
1028       continue;
1029     // Now that we know this is a real instruction, count it.
1030     ++Count;
1031 
1032     // If the load instruction reads directly from the address to which the
1033     // store instruction writes and the stored value is not modified, we can
1034     // promote the load. Since we do not handle stores with pre-/post-index,
1035     // it's unnecessary to check if BaseReg is modified by the store itself.
1036     if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
1037         BaseReg == getLdStBaseOp(MI).getReg() &&
1038         isLdOffsetInRangeOfSt(FirstMI, MI) &&
1039         !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
1040       StoreI = MBBI;
1041       return true;
1042     }
1043 
1044     if (MI->isCall())
1045       return false;
1046 
1047     // Update modified / uses register lists.
1048     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1049 
1050     // Otherwise, if the base register is modified, we have no match, so
1051     // return early.
1052     if (ModifiedRegs[BaseReg])
1053       return false;
1054 
1055     // If we encounter a store aliased with the load, return early.
1056     if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
1057       return false;
1058   }
1059   return false;
1060 }
1061 
1062 /// findMatchingInsn - Scan the instructions looking for a load/store that can
1063 /// be combined with the current instruction into a load/store pair.
1064 MachineBasicBlock::iterator
1065 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1066                                       LdStPairFlags &Flags, unsigned Limit) {
1067   MachineBasicBlock::iterator E = I->getParent()->end();
1068   MachineBasicBlock::iterator MBBI = I;
1069   MachineInstr *FirstMI = I;
1070   ++MBBI;
1071 
1072   unsigned Opc = FirstMI->getOpcode();
1073   bool MayLoad = FirstMI->mayLoad();
1074   bool IsUnscaled = isUnscaledLdSt(FirstMI);
1075   unsigned Reg = getLdStRegOp(FirstMI).getReg();
1076   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1077   int Offset = getLdStOffsetOp(FirstMI).getImm();
1078   bool IsNarrowStore = isNarrowStore(Opc);
1079 
1080   // For narrow stores, find only the case where the stored value is WZR.
1081   if (IsNarrowStore && Reg != AArch64::WZR)
1082     return E;
1083 
1084   // Early exit if the first instruction modifies the base register.
1085   // e.g., ldr x0, [x0]
1086   if (FirstMI->modifiesRegister(BaseReg, TRI))
1087     return E;
1088 
  // Early exit if the offset is not possible to match. (6 bits of positive
1090   // range, plus allow an extra one in case we find a later insn that matches
1091   // with Offset-1)
1092   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
1093   if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
1094       !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1095     return E;
1096 
1097   // Track which registers have been modified and used between the first insn
1098   // (inclusive) and the second insn.
1099   BitVector ModifiedRegs, UsedRegs;
1100   ModifiedRegs.resize(TRI->getNumRegs());
1101   UsedRegs.resize(TRI->getNumRegs());
1102 
1103   // Remember any instructions that read/write memory between FirstMI and MI.
1104   SmallVector<MachineInstr *, 4> MemInsns;
1105 
1106   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1107     MachineInstr *MI = MBBI;
1108     // Skip DBG_VALUE instructions. Otherwise debug info can affect the
1109     // optimization by changing how far we scan.
1110     if (MI->isDebugValue())
1111       continue;
1112 
1113     // Now that we know this is a real instruction, count it.
1114     ++Count;
1115 
1116     bool CanMergeOpc = Opc == MI->getOpcode();
1117     Flags.setSExtIdx(-1);
1118     if (!CanMergeOpc) {
1119       bool IsValidLdStrOpc;
1120       unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
1121       assert(IsValidLdStrOpc &&
1122              "Given Opc should be a Load or Store with an immediate");
1123       // Opc will be the first instruction in the pair.
1124       Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
1125       CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
1126     }
1127 
1128     if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
1129       assert(MI->mayLoadOrStore() && "Expected memory operation.");
1130       // If we've found another instruction with the same opcode, check to see
1131       // if the base and offset are compatible with our starting instruction.
1132       // These instructions all have scaled immediate operands, so we just
1133       // check for +1/-1. Make sure to check the new instruction offset is
1134       // actually an immediate and not a symbolic reference destined for
1135       // a relocation.
1136       //
1137       // Pairwise instructions have a 7-bit signed offset field. Single insns
1138       // have a 12-bit unsigned offset field. To be a valid combine, the
1139       // final offset must be in range.
1140       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
1141       int MIOffset = getLdStOffsetOp(MI).getImm();
1142       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1143                                    (Offset + OffsetStride == MIOffset))) {
1144         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1145         // If this is a volatile load/store that otherwise matched, stop looking
1146         // as something is going on that we don't have enough information to
1147         // safely transform. Similarly, stop if we see a hint to avoid pairs.
1148         if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1149           return E;
1150         // If the resultant immediate offset of merging these instructions
1151         // is out of range for a pairwise instruction, bail and keep looking.
1152         bool MIIsUnscaled = isUnscaledLdSt(MI);
1153         bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
1154         if (!IsNarrowLoad &&
1155             !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
1156           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1157           MemInsns.push_back(MI);
1158           continue;
1159         }
1160 
1161         if (IsNarrowLoad || IsNarrowStore) {
1162           // If the alignment requirements of the scaled wide load/store
1163           // instruction can't express the offset of the scaled narrow
1164           // input, bail and keep looking.
1165           if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
1166             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1167             MemInsns.push_back(MI);
1168             continue;
1169           }
1170         } else {
1171           // If the alignment requirements of the paired (scaled) instruction
1172           // can't express the offset of the unscaled input, bail and keep
1173           // looking.
1174           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1175             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1176             MemInsns.push_back(MI);
1177             continue;
1178           }
1179         }
1180         // If the destination register of the loads is the same register, bail
1181         // and keep looking. A load-pair instruction with both destination
1182         // registers the same is UNPREDICTABLE and will result in an exception.
1183         // For narrow stores, allow only when the stored value is the same
1184         // (i.e., WZR).
1185         if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
1186             (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
1187           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1188           MemInsns.push_back(MI);
1189           continue;
1190         }
1191 
1192         // If the Rt of the second instruction was not modified or used between
1193         // the two instructions and none of the instructions between the second
1194         // and first alias with the second, we can combine the second into the
1195         // first.
1196         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
1197             !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
1198             !mayAlias(MI, MemInsns, TII)) {
1199           Flags.setMergeForward(false);
1200           return MBBI;
1201         }
1202 
1203         // Likewise, if the Rt of the first instruction is not modified or used
1204         // between the two instructions and none of the instructions between the
1205         // first and the second alias with the first, we can combine the first
1206         // into the second.
1207         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
1208             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
1209             !mayAlias(FirstMI, MemInsns, TII)) {
1210           Flags.setMergeForward(true);
1211           return MBBI;
1212         }
1213         // Unable to combine these instructions due to interference in between.
1214         // Keep looking.
1215       }
1216     }
1217 
    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
1220     if (MI->isCall())
1221       return E;
1222 
1223     // Update modified / uses register lists.
1224     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1225 
1226     // Otherwise, if the base register is modified, we have no match, so
1227     // return early.
1228     if (ModifiedRegs[BaseReg])
1229       return E;
1230 
1231     // Update list of instructions that read/write memory.
1232     if (MI->mayLoadOrStore())
1233       MemInsns.push_back(MI);
1234   }
1235   return E;
1236 }
1237 
1238 MachineBasicBlock::iterator
1239 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1240                                      MachineBasicBlock::iterator Update,
1241                                      bool IsPreIdx) {
1242   assert((Update->getOpcode() == AArch64::ADDXri ||
1243           Update->getOpcode() == AArch64::SUBXri) &&
1244          "Unexpected base register update instruction to merge!");
1245   MachineBasicBlock::iterator NextI = I;
1246   // Return the instruction following the merged instruction, which is
1247   // the instruction following our unmerged load. Unless that's the add/sub
1248   // instruction we're merging, in which case it's the one after that.
1249   if (++NextI == Update)
1250     ++NextI;
1251 
1252   int Value = Update->getOperand(2).getImm();
1253   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1254          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1255   if (Update->getOpcode() == AArch64::SUBXri)
1256     Value = -Value;
1257 
1258   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1259                              : getPostIndexedOpcode(I->getOpcode());
1260   MachineInstrBuilder MIB;
1261   if (!isPairedLdSt(I)) {
1262     // Non-paired instruction.
1263     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1264               .addOperand(getLdStRegOp(Update))
1265               .addOperand(getLdStRegOp(I))
1266               .addOperand(getLdStBaseOp(I))
1267               .addImm(Value)
1268               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1269   } else {
1270     // Paired instruction.
1271     int Scale = getMemScale(I);
1272     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1273               .addOperand(getLdStRegOp(Update))
1274               .addOperand(getLdStRegOp(I, 0))
1275               .addOperand(getLdStRegOp(I, 1))
1276               .addOperand(getLdStBaseOp(I))
1277               .addImm(Value / Scale)
1278               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1279   }
1280   (void)MIB;
1281 
1282   if (IsPreIdx)
1283     DEBUG(dbgs() << "Creating pre-indexed load/store.");
1284   else
1285     DEBUG(dbgs() << "Creating post-indexed load/store.");
1286   DEBUG(dbgs() << "    Replacing instructions:\n    ");
1287   DEBUG(I->print(dbgs()));
1288   DEBUG(dbgs() << "    ");
1289   DEBUG(Update->print(dbgs()));
1290   DEBUG(dbgs() << "  with instruction:\n    ");
1291   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1292   DEBUG(dbgs() << "\n");
1293 
1294   // Erase the old instructions for the block.
1295   I->eraseFromParent();
1296   Update->eraseFromParent();
1297 
1298   return NextI;
1299 }
1300 
1301 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
1302                                                MachineInstr *MI,
1303                                                unsigned BaseReg, int Offset) {
1304   switch (MI->getOpcode()) {
1305   default:
1306     break;
1307   case AArch64::SUBXri:
1308     // Negate the offset for a SUB instruction.
1309     Offset *= -1;
1310   // FALLTHROUGH
1311   case AArch64::ADDXri:
1312     // Make sure it's a vanilla immediate operand, not a relocation or
1313     // anything else we can't handle.
1314     if (!MI->getOperand(2).isImm())
1315       break;
1316     // Watch out for 1 << 12 shifted value.
1317     if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
1318       break;
1319 
1320     // The update instruction source and destination register must be the
1321     // same as the load/store base register.
1322     if (MI->getOperand(0).getReg() != BaseReg ||
1323         MI->getOperand(1).getReg() != BaseReg)
1324       break;
1325 
1326     bool IsPairedInsn = isPairedLdSt(MemMI);
1327     int UpdateOffset = MI->getOperand(2).getImm();
1328     // For non-paired load/store instructions, the immediate must fit in a
1329     // signed 9-bit integer.
1330     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1331       break;
1332 
1333     // For paired load/store instructions, the immediate must be a multiple of
1334     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
1335     // integer.
1336     if (IsPairedInsn) {
1337       int Scale = getMemScale(MemMI);
1338       if (UpdateOffset % Scale != 0)
1339         break;
1340 
1341       int ScaledOffset = UpdateOffset / Scale;
1342       if (ScaledOffset > 64 || ScaledOffset < -64)
1343         break;
1344     }
1345 
1346     // If we have a non-zero Offset, we check that it matches the amount
1347     // we're adding to the register.
1348     if (!Offset || Offset == MI->getOperand(2).getImm())
1349       return true;
1350     break;
1351   }
1352   return false;
1353 }
1354 
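// For example, a forward scan can fold
//   ldr x0, [x2]
//   ...
//   add x2, x2, #8
// into the post-indexed form "ldr x0, [x2], #8", provided x2 is neither used
// nor redefined by the intervening instructions.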
1355 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1356     MachineBasicBlock::iterator I, int UnscaledOffset) {
1357   MachineBasicBlock::iterator E = I->getParent()->end();
1358   MachineInstr *MemMI = I;
1359   MachineBasicBlock::iterator MBBI = I;
1360 
1361   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1362   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1363 
1364   // Scan forward looking for post-index opportunities.  Updating instructions
1365   // can't be formed if the memory instruction doesn't have the offset we're
1366   // looking for.
1367   if (MIUnscaledOffset != UnscaledOffset)
1368     return E;
1369 
1370   // If the base register overlaps a destination register, we can't
1371   // merge the update.
1372   bool IsPairedInsn = isPairedLdSt(MemMI);
1373   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1374     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1375     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1376       return E;
1377   }
1378 
1379   // Track which registers have been modified and used between the first insn
1380   // (inclusive) and the second insn.
1381   BitVector ModifiedRegs, UsedRegs;
1382   ModifiedRegs.resize(TRI->getNumRegs());
1383   UsedRegs.resize(TRI->getNumRegs());
1384   ++MBBI;
1385   for (; MBBI != E; ++MBBI) {
1386     MachineInstr *MI = MBBI;
1387     // Skip DBG_VALUE instructions.
1388     if (MI->isDebugValue())
1389       continue;
1390 
1391     // If we found a match, return it.
1392     if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
1393       return MBBI;
1394 
1395     // Update the status of what the instruction clobbered and used.
1396     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1397 
1398     // Otherwise, if the base register is used or modified, we have no match, so
1399     // return early.
1400     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1401       return E;
1402   }
1403   return E;
1404 }
1405 
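// For example, a backward scan can fold
//   add x2, x2, #16
//   ...
//   str x0, [x2]
// into the pre-indexed form "str x0, [x2, #16]!", provided x2 is neither used
// nor redefined by the intervening instructions.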
1406 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1407     MachineBasicBlock::iterator I) {
1408   MachineBasicBlock::iterator B = I->getParent()->begin();
1409   MachineBasicBlock::iterator E = I->getParent()->end();
1410   MachineInstr *MemMI = I;
1411   MachineBasicBlock::iterator MBBI = I;
1412 
1413   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1414   int Offset = getLdStOffsetOp(MemMI).getImm();
1415 
1416   // If the load/store is the first instruction in the block, there's obviously
1417   // not any matching update. Ditto if the memory offset isn't zero.
1418   if (MBBI == B || Offset != 0)
1419     return E;
1420   // If the base register overlaps a destination register, we can't
1421   // merge the update.
1422   bool IsPairedInsn = isPairedLdSt(MemMI);
1423   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1424     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1425     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1426       return E;
1427   }
1428 
1429   // Track which registers have been modified and used between the first insn
1430   // (inclusive) and the second insn.
1431   BitVector ModifiedRegs, UsedRegs;
1432   ModifiedRegs.resize(TRI->getNumRegs());
1433   UsedRegs.resize(TRI->getNumRegs());
1434   --MBBI;
1435   for (; MBBI != B; --MBBI) {
1436     MachineInstr *MI = MBBI;
1437     // Skip DBG_VALUE instructions.
1438     if (MI->isDebugValue())
1439       continue;
1440 
1441     // If we found a match, return it.
1442     if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
1443       return MBBI;
1444 
1445     // Update the status of what the instruction clobbered and used.
1446     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1447 
1448     // Otherwise, if the base register is used or modified, we have no match, so
1449     // return early.
1450     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1451       return E;
1452   }
1453   return E;
1454 }
1455 
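// Try to replace the load at MBBI with a value forwarded from an earlier
// store whose stored bytes cover the loaded bytes: a mov when the widths
// match, or a bitfield extract when the store is wider. On success, MBBI is
// advanced past the rewritten instruction and true is returned.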
1456 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1457     MachineBasicBlock::iterator &MBBI) {
1458   MachineInstr *MI = MBBI;
1459   // If this is a volatile or ordered load, don't mess with it.
1460   if (MI->hasOrderedMemoryRef())
1461     return false;
1462 
1463   // Make sure this is a reg+imm.
1464   // FIXME: It is possible to extend it to handle reg+reg cases.
1465   if (!getLdStOffsetOp(MI).isImm())
1466     return false;
1467 
1468   // Look backward up to ScanLimit instructions.
1469   MachineBasicBlock::iterator StoreI;
1470   if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
1471     ++NumLoadsFromStoresPromoted;
1472     // Promote the load. Keeping the iterator straight is a
1473     // pain, so we let the merge routine tell us what the next instruction
1474     // is after it's done mucking about.
1475     MBBI = promoteLoadFromStore(MBBI, StoreI);
1476     return true;
1477   }
1478   return false;
1479 }
1480 
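// Try to combine the load/store at MBBI with a nearby compatible instruction,
// forming either a load/store pair or, for narrow accesses, a single wider
// access. On success, MBBI is advanced past the merged instruction and true
// is returned.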
1481 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
1482     MachineBasicBlock::iterator &MBBI) {
1483   MachineInstr *MI = MBBI;
1484   MachineBasicBlock::iterator E = MI->getParent()->end();
1485   // If this is a volatile or ordered load/store, don't mess with it.
1486   if (MI->hasOrderedMemoryRef())
1487     return false;
1488 
1489   // Make sure this is a reg+imm (as opposed to an address reloc).
1490   if (!getLdStOffsetOp(MI).isImm())
1491     return false;
1492 
1493   // Check if this load/store has a hint to avoid pair formation.
1494   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1495   if (TII->isLdStPairSuppressed(MI))
1496     return false;
1497 
1498   // Look ahead up to ScanLimit instructions for a pairable instruction.
1499   LdStPairFlags Flags;
1500   MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
1501   if (Paired != E) {
1502     if (isNarrowLoad(MI)) {
1503       ++NumNarrowLoadsPromoted;
1504     } else if (isNarrowStore(MI)) {
1505       ++NumZeroStoresPromoted;
1506     } else {
1507       ++NumPairCreated;
1508       if (isUnscaledLdSt(MI))
1509         ++NumUnscaledPairCreated;
1510     }
1511 
1512     // Merge the loads into a pair. Keeping the iterator straight is a
1513     // pain, so we let the merge routine tell us what the next instruction
1514     // is after it's done mucking about.
1515     MBBI = mergePairedInsns(MBBI, Paired, Flags);
1516     return true;
1517   }
1518   return false;
1519 }
1520 
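// Run the per-block peephole transformations described in the comment below.
// Returns true if any instruction in MBB was changed.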
1521 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1522                                         bool enableNarrowLdOpt) {
1523   bool Modified = false;
1524   // Four transformations to do here:
1525   // 1) Find loads that directly read from stores and promote them by
1526   //    replacing with mov instructions. If the store is wider than the load,
1527   //    the load will be replaced with a bitfield extract.
1528   //      e.g.,
1529   //        str w1, [x0, #4]
1530   //        ldrh w2, [x0, #6]
1531   //        ; becomes
1532   //        str w1, [x0, #4]
1533   //        lsr w2, w1, #16
1534   // 2) Find narrow loads that can be converted into a single wider load
1535   //    with bitfield extract instructions.
1536   //      e.g.,
1537   //        ldrh w0, [x2]
1538   //        ldrh w1, [x2, #2]
1539   //        ; becomes
1540   //        ldr w0, [x2]
1541   //        ubfx w1, w0, #16, #16
1542   //        and w0, w0, #0xffff
1543   // 3) Find loads and stores that can be merged into a single load or store
1544   //    pair instruction.
1545   //      e.g.,
1546   //        ldr x0, [x2]
1547   //        ldr x1, [x2, #8]
1548   //        ; becomes
1549   //        ldp x0, x1, [x2]
1550   // 4) Find base register updates that can be merged into the load or store
1551   //    as a base-reg writeback.
1552   //      e.g.,
1553   //        ldr x0, [x2]
1554   //        add x2, x2, #4
1555   //        ; becomes
1556   //        ldr x0, [x2], #4
1557 
1558   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1559        MBBI != E;) {
1560     MachineInstr *MI = MBBI;
1561     switch (MI->getOpcode()) {
1562     default:
1563       // Just move on to the next instruction.
1564       ++MBBI;
1565       break;
1566     // Scaled instructions.
1567     case AArch64::LDRBBui:
1568     case AArch64::LDRHHui:
1569     case AArch64::LDRWui:
1570     case AArch64::LDRXui:
1571     // Unscaled instructions.
1572     case AArch64::LDURBBi:
1573     case AArch64::LDURHHi:
1574     case AArch64::LDURWi:
1575     case AArch64::LDURXi: {
1576       if (tryToPromoteLoadFromStore(MBBI)) {
1577         Modified = true;
1578         break;
1579       }
1580       ++MBBI;
1581       break;
1582     }
1583     }
1584   }
1585 
1586   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1587        enableNarrowLdOpt && MBBI != E;) {
1588     MachineInstr *MI = MBBI;
1589     switch (MI->getOpcode()) {
1590     default:
1591       // Just move on to the next instruction.
1592       ++MBBI;
1593       break;
1594     // Scaled instructions.
1595     case AArch64::LDRBBui:
1596     case AArch64::LDRHHui:
1597     case AArch64::LDRSBWui:
1598     case AArch64::LDRSHWui:
1599     case AArch64::STRBBui:
1600     case AArch64::STRHHui:
1601     // Unscaled instructions.
1602     case AArch64::LDURBBi:
1603     case AArch64::LDURHHi:
1604     case AArch64::LDURSBWi:
1605     case AArch64::LDURSHWi:
1606     case AArch64::STURBBi:
1607     case AArch64::STURHHi: {
1608       if (tryToMergeLdStInst(MBBI)) {
1609         Modified = true;
1610         break;
1611       }
1612       ++MBBI;
1613       break;
1614     }
1615     }
1616   }
1617 
1618   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1619        MBBI != E;) {
1620     MachineInstr *MI = MBBI;
1621     switch (MI->getOpcode()) {
1622     default:
1623       // Just move on to the next instruction.
1624       ++MBBI;
1625       break;
1626     // Scaled instructions.
1627     case AArch64::STRSui:
1628     case AArch64::STRDui:
1629     case AArch64::STRQui:
1630     case AArch64::STRXui:
1631     case AArch64::STRWui:
1632     case AArch64::LDRSui:
1633     case AArch64::LDRDui:
1634     case AArch64::LDRQui:
1635     case AArch64::LDRXui:
1636     case AArch64::LDRWui:
1637     case AArch64::LDRSWui:
1638     // Unscaled instructions.
1639     case AArch64::STURSi:
1640     case AArch64::STURDi:
1641     case AArch64::STURQi:
1642     case AArch64::STURWi:
1643     case AArch64::STURXi:
1644     case AArch64::LDURSi:
1645     case AArch64::LDURDi:
1646     case AArch64::LDURQi:
1647     case AArch64::LDURWi:
1648     case AArch64::LDURXi:
1649     case AArch64::LDURSWi: {
1650       if (tryToMergeLdStInst(MBBI)) {
1651         Modified = true;
1652         break;
1653       }
1654       ++MBBI;
1655       break;
1656     }
1657     }
1658   }
1659 
1660   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1661        MBBI != E;) {
1662     MachineInstr *MI = MBBI;
1663     // Do update merging. It's simpler to keep this separate from the above
1664     // switch, though not strictly necessary.
1665     unsigned Opc = MI->getOpcode();
1666     switch (Opc) {
1667     default:
1668       // Just move on to the next instruction.
1669       ++MBBI;
1670       break;
1671     // Scaled instructions.
1672     case AArch64::STRSui:
1673     case AArch64::STRDui:
1674     case AArch64::STRQui:
1675     case AArch64::STRXui:
1676     case AArch64::STRWui:
1677     case AArch64::STRHHui:
1678     case AArch64::STRBBui:
1679     case AArch64::LDRSui:
1680     case AArch64::LDRDui:
1681     case AArch64::LDRQui:
1682     case AArch64::LDRXui:
1683     case AArch64::LDRWui:
1684     case AArch64::LDRHHui:
1685     case AArch64::LDRBBui:
1686     // Unscaled instructions.
1687     case AArch64::STURSi:
1688     case AArch64::STURDi:
1689     case AArch64::STURQi:
1690     case AArch64::STURWi:
1691     case AArch64::STURXi:
1692     case AArch64::LDURSi:
1693     case AArch64::LDURDi:
1694     case AArch64::LDURQi:
1695     case AArch64::LDURWi:
1696     case AArch64::LDURXi:
1697     // Paired instructions.
1698     case AArch64::LDPSi:
1699     case AArch64::LDPSWi:
1700     case AArch64::LDPDi:
1701     case AArch64::LDPQi:
1702     case AArch64::LDPWi:
1703     case AArch64::LDPXi:
1704     case AArch64::STPSi:
1705     case AArch64::STPDi:
1706     case AArch64::STPQi:
1707     case AArch64::STPWi:
1708     case AArch64::STPXi: {
1709       // Make sure this is a reg+imm (as opposed to an address reloc).
1710       if (!getLdStOffsetOp(MI).isImm()) {
1711         ++MBBI;
1712         break;
1713       }
1714       // Look forward to try to form a post-index instruction. For example,
1715       // ldr x0, [x20]
1716       // add x20, x20, #32
1717       //   merged into:
1718       // ldr x0, [x20], #32
1719       MachineBasicBlock::iterator Update =
1720           findMatchingUpdateInsnForward(MBBI, 0);
1721       if (Update != E) {
1722         // Merge the update into the ld/st.
1723         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1724         Modified = true;
1725         ++NumPostFolded;
1726         break;
1727       }
1728       // We don't handle pre-indexed forms of the unscaled instructions below,
1729       // so move to the next instruction.
1730       if (isUnscaledLdSt(Opc)) {
1731         ++MBBI;
1732         break;
1733       }
1734 
1735       // Look back to try to find a pre-index instruction. For example,
1736       // add x0, x0, #8
1737       // ldr x1, [x0]
1738       //   merged into:
1739       // ldr x1, [x0, #8]!
1740       Update = findMatchingUpdateInsnBackward(MBBI);
1741       if (Update != E) {
1742         // Merge the update into the ld/st.
1743         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1744         Modified = true;
1745         ++NumPreFolded;
1746         break;
1747       }
1748       // The immediate in the load/store is scaled by the size of the memory
1749       // operation. The immediate in the add we're looking for,
1750       // however, is not, so adjust here.
1751       int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1752 
1753       // Look forward to try to find a pre-index instruction. For example,
1754       // ldr x1, [x0, #64]
1755       // add x0, x0, #64
1756       //   merged into:
1757       // ldr x1, [x0, #64]!
1758       Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset);
1759       if (Update != E) {
1760         // Merge the update into the ld/st.
1761         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1762         Modified = true;
1763         ++NumPreFolded;
1764         break;
1765       }
1766 
1767       // Nothing found. Just move to the next instruction.
1768       ++MBBI;
1769       break;
1770     }
1771     }
1772   }
1773 
1774   return Modified;
1775 }
1776 
1777 bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
1778   bool ProfitableArch = Subtarget->isCortexA57();
1779   // FIXME: The benefit from converting narrow loads into a wider load is
1780   // microarchitectural: it assumes that a single load with two bitfield
1781   // extracts is cheaper than two narrow loads. Currently, this conversion is
1782   // enabled only on Cortex-A57, where the performance benefit was verified.
1783   return ProfitableArch && !Subtarget->requiresStrictAlign();
1784 }
1785 
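// Entry point for the pass: cache the subtarget, instruction, and register
// info, then optimize each basic block in the function.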
1786 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1787   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1788   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1789   TRI = Subtarget->getRegisterInfo();
1790 
1791   bool Modified = false;
1792   bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
1793   for (auto &MBB : Fn)
1794     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
1795 
1796   return Modified;
1797 }
1798 
1799 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
1800 // loads and stores near one another?
1801 
1802 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1803 /// load / store optimization pass.
1804 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1805   return new AArch64LoadStoreOpt();
1806 }
1807