1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
32 using namespace llvm;
33 
34 #define DEBUG_TYPE "aarch64-ldst-opt"
35 
36 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
37 STATISTIC(NumPostFolded, "Number of post-index updates folded");
38 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
39 STATISTIC(NumUnscaledPairCreated,
40           "Number of load/store from unscaled generated");
41 STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
42 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
43 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
44 
45 // The LdStLimit limits how far we search for load/store pairs.
46 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
47                                    cl::init(20), cl::Hidden);
48 
49 // The UpdateLimit limits how far we search for update instructions when we form
50 // pre-/post-index instructions.
51 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
52                                      cl::Hidden);
53 
54 static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
55                                          cl::init(false),
56                                          cl::desc("Enable narrow load merge"));
57 
58 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
59 
60 namespace {
61 
62 typedef struct LdStPairFlags {
63   // If a matching instruction is found, MergeForward is set to true if the
64   // merge is to remove the first instruction and replace the second with
65   // a pair-wise insn, and false if the reverse is true.
66   bool MergeForward;
67 
68   // SExtIdx gives the index of the result of the load pair that must be
69   // extended. The value of SExtIdx assumes that the paired load produces the
70   // value in this order: (I, returned iterator), i.e., -1 means no value has
71   // to be extended, 0 means I, and 1 means the returned iterator.
72   int SExtIdx;
73 
74   LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
75 
76   void setMergeForward(bool V = true) { MergeForward = V; }
77   bool getMergeForward() const { return MergeForward; }
78 
79   void setSExtIdx(int V) { SExtIdx = V; }
80   int getSExtIdx() const { return SExtIdx; }
81 
82 } LdStPairFlags;
83 
84 struct AArch64LoadStoreOpt : public MachineFunctionPass {
85   static char ID;
86   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
87     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
88   }
89 
90   const AArch64InstrInfo *TII;
91   const TargetRegisterInfo *TRI;
92   const AArch64Subtarget *Subtarget;
93 
94   // Track which registers have been modified and used.
95   BitVector ModifiedRegs, UsedRegs;
96 
97   // Scan the instructions looking for a load/store that can be combined
98   // with the current instruction into a load/store pair.
99   // Return the matching instruction if one is found, else MBB->end().
100   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
101                                                LdStPairFlags &Flags,
102                                                unsigned Limit,
103                                                bool FindNarrowMerge);
104 
105   // Scan the instructions looking for a store that writes to the address from
106   // which the current load instruction reads. Return true if one is found.
107   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
108                          MachineBasicBlock::iterator &StoreI);
109 
110   // Merge the two instructions indicated into a wider instruction.
111   MachineBasicBlock::iterator
112   mergeNarrowInsns(MachineBasicBlock::iterator I,
113                    MachineBasicBlock::iterator MergeMI,
114                    const LdStPairFlags &Flags);
115 
116   // Merge the two instructions indicated into a single pair-wise instruction.
117   MachineBasicBlock::iterator
118   mergePairedInsns(MachineBasicBlock::iterator I,
119                    MachineBasicBlock::iterator Paired,
120                    const LdStPairFlags &Flags);
121 
122   // Promote the load that reads directly from the address stored to.
123   MachineBasicBlock::iterator
124   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
125                        MachineBasicBlock::iterator StoreI);
126 
127   // Scan the instruction list to find a base register update that can
128   // be combined with the current instruction (a load or store) using
129   // pre or post indexed addressing with writeback. Scan forwards.
130   MachineBasicBlock::iterator
131   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
132                                 int UnscaledOffset, unsigned Limit);
133 
134   // Scan the instruction list to find a base register update that can
135   // be combined with the current instruction (a load or store) using
136   // pre or post indexed addressing with writeback. Scan backwards.
137   MachineBasicBlock::iterator
138   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
139 
140   // Find an instruction that updates the base register of the ld/st
141   // instruction.
142   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
143                             unsigned BaseReg, int Offset);
144 
145   // Merge a pre- or post-index base register update into a ld/st instruction.
146   MachineBasicBlock::iterator
147   mergeUpdateInsn(MachineBasicBlock::iterator I,
148                   MachineBasicBlock::iterator Update, bool IsPreIdx);
149 
150   // Find and merge foldable ldr/str instructions.
151   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
152 
153   // Find and pair ldr/str instructions.
154   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
155 
156   // Find and promote load instructions which read directly from store.
157   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
158 
159   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
160 
161   bool runOnMachineFunction(MachineFunction &Fn) override;
162 
163   MachineFunctionProperties getRequiredProperties() const override {
164     return MachineFunctionProperties().set(
165         MachineFunctionProperties::Property::NoVRegs);
166   }
167 
168   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
169 };
170 char AArch64LoadStoreOpt::ID = 0;
171 } // namespace
172 
173 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
174                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
175 
176 static unsigned getBitExtrOpcode(MachineInstr &MI) {
177   switch (MI.getOpcode()) {
178   default:
179     llvm_unreachable("Unexpected opcode.");
180   case AArch64::LDRBBui:
181   case AArch64::LDURBBi:
182   case AArch64::LDRHHui:
183   case AArch64::LDURHHi:
184     return AArch64::UBFMWri;
185   case AArch64::LDRSBWui:
186   case AArch64::LDURSBWi:
187   case AArch64::LDRSHWui:
188   case AArch64::LDURSHWi:
189     return AArch64::SBFMWri;
190   }
191 }
192 
193 static bool isNarrowStore(unsigned Opc) {
194   switch (Opc) {
195   default:
196     return false;
197   case AArch64::STRBBui:
198   case AArch64::STURBBi:
199   case AArch64::STRHHui:
200   case AArch64::STURHHi:
201     return true;
202   }
203 }
204 
205 static bool isNarrowLoad(unsigned Opc) {
206   switch (Opc) {
207   default:
208     return false;
209   case AArch64::LDRHHui:
210   case AArch64::LDURHHi:
211   case AArch64::LDRBBui:
212   case AArch64::LDURBBi:
213   case AArch64::LDRSHWui:
214   case AArch64::LDURSHWi:
215   case AArch64::LDRSBWui:
216   case AArch64::LDURSBWi:
217     return true;
218   }
219 }
220 
221 static bool isNarrowLoad(MachineInstr &MI) {
222   return isNarrowLoad(MI.getOpcode());
223 }
224 
225 static bool isNarrowLoadOrStore(unsigned Opc) {
226   return isNarrowLoad(Opc) || isNarrowStore(Opc);
227 }
228 
229 // Scaling factor for unscaled load or store.
230 static int getMemScale(MachineInstr &MI) {
231   switch (MI.getOpcode()) {
232   default:
233     llvm_unreachable("Opcode has unknown scale!");
234   case AArch64::LDRBBui:
235   case AArch64::LDURBBi:
236   case AArch64::LDRSBWui:
237   case AArch64::LDURSBWi:
238   case AArch64::STRBBui:
239   case AArch64::STURBBi:
240     return 1;
241   case AArch64::LDRHHui:
242   case AArch64::LDURHHi:
243   case AArch64::LDRSHWui:
244   case AArch64::LDURSHWi:
245   case AArch64::STRHHui:
246   case AArch64::STURHHi:
247     return 2;
248   case AArch64::LDRSui:
249   case AArch64::LDURSi:
250   case AArch64::LDRSWui:
251   case AArch64::LDURSWi:
252   case AArch64::LDRWui:
253   case AArch64::LDURWi:
254   case AArch64::STRSui:
255   case AArch64::STURSi:
256   case AArch64::STRWui:
257   case AArch64::STURWi:
258   case AArch64::LDPSi:
259   case AArch64::LDPSWi:
260   case AArch64::LDPWi:
261   case AArch64::STPSi:
262   case AArch64::STPWi:
263     return 4;
264   case AArch64::LDRDui:
265   case AArch64::LDURDi:
266   case AArch64::LDRXui:
267   case AArch64::LDURXi:
268   case AArch64::STRDui:
269   case AArch64::STURDi:
270   case AArch64::STRXui:
271   case AArch64::STURXi:
272   case AArch64::LDPDi:
273   case AArch64::LDPXi:
274   case AArch64::STPDi:
275   case AArch64::STPXi:
276     return 8;
277   case AArch64::LDRQui:
278   case AArch64::LDURQi:
279   case AArch64::STRQui:
280   case AArch64::STURQi:
281   case AArch64::LDPQi:
282   case AArch64::STPQi:
283     return 16;
284   }
285 }
286 
287 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
288                                          bool *IsValidLdStrOpc = nullptr) {
289   if (IsValidLdStrOpc)
290     *IsValidLdStrOpc = true;
291   switch (Opc) {
292   default:
293     if (IsValidLdStrOpc)
294       *IsValidLdStrOpc = false;
295     return UINT_MAX;
296   case AArch64::STRDui:
297   case AArch64::STURDi:
298   case AArch64::STRQui:
299   case AArch64::STURQi:
300   case AArch64::STRBBui:
301   case AArch64::STURBBi:
302   case AArch64::STRHHui:
303   case AArch64::STURHHi:
304   case AArch64::STRWui:
305   case AArch64::STURWi:
306   case AArch64::STRXui:
307   case AArch64::STURXi:
308   case AArch64::LDRDui:
309   case AArch64::LDURDi:
310   case AArch64::LDRQui:
311   case AArch64::LDURQi:
312   case AArch64::LDRWui:
313   case AArch64::LDURWi:
314   case AArch64::LDRXui:
315   case AArch64::LDURXi:
316   case AArch64::STRSui:
317   case AArch64::STURSi:
318   case AArch64::LDRSui:
319   case AArch64::LDURSi:
320   case AArch64::LDRHHui:
321   case AArch64::LDURHHi:
322   case AArch64::LDRBBui:
323   case AArch64::LDURBBi:
324     return Opc;
325   case AArch64::LDRSWui:
326     return AArch64::LDRWui;
327   case AArch64::LDURSWi:
328     return AArch64::LDURWi;
329   case AArch64::LDRSBWui:
330     return AArch64::LDRBBui;
331   case AArch64::LDRSHWui:
332     return AArch64::LDRHHui;
333   case AArch64::LDURSBWi:
334     return AArch64::LDURBBi;
335   case AArch64::LDURSHWi:
336     return AArch64::LDURHHi;
337   }
338 }
339 
340 static unsigned getMatchingWideOpcode(unsigned Opc) {
341   switch (Opc) {
342   default:
343     llvm_unreachable("Opcode has no wide equivalent!");
344   case AArch64::STRBBui:
345     return AArch64::STRHHui;
346   case AArch64::STRHHui:
347     return AArch64::STRWui;
348   case AArch64::STURBBi:
349     return AArch64::STURHHi;
350   case AArch64::STURHHi:
351     return AArch64::STURWi;
352   case AArch64::STURWi:
353     return AArch64::STURXi;
354   case AArch64::STRWui:
355     return AArch64::STRXui;
356   case AArch64::LDRHHui:
357   case AArch64::LDRSHWui:
358     return AArch64::LDRWui;
359   case AArch64::LDURHHi:
360   case AArch64::LDURSHWi:
361     return AArch64::LDURWi;
362   case AArch64::LDRBBui:
363   case AArch64::LDRSBWui:
364     return AArch64::LDRHHui;
365   case AArch64::LDURBBi:
366   case AArch64::LDURSBWi:
367     return AArch64::LDURHHi;
368   }
369 }
370 
371 static unsigned getMatchingPairOpcode(unsigned Opc) {
372   switch (Opc) {
373   default:
374     llvm_unreachable("Opcode has no pairwise equivalent!");
375   case AArch64::STRSui:
376   case AArch64::STURSi:
377     return AArch64::STPSi;
378   case AArch64::STRDui:
379   case AArch64::STURDi:
380     return AArch64::STPDi;
381   case AArch64::STRQui:
382   case AArch64::STURQi:
383     return AArch64::STPQi;
384   case AArch64::STRWui:
385   case AArch64::STURWi:
386     return AArch64::STPWi;
387   case AArch64::STRXui:
388   case AArch64::STURXi:
389     return AArch64::STPXi;
390   case AArch64::LDRSui:
391   case AArch64::LDURSi:
392     return AArch64::LDPSi;
393   case AArch64::LDRDui:
394   case AArch64::LDURDi:
395     return AArch64::LDPDi;
396   case AArch64::LDRQui:
397   case AArch64::LDURQi:
398     return AArch64::LDPQi;
399   case AArch64::LDRWui:
400   case AArch64::LDURWi:
401     return AArch64::LDPWi;
402   case AArch64::LDRXui:
403   case AArch64::LDURXi:
404     return AArch64::LDPXi;
405   case AArch64::LDRSWui:
406   case AArch64::LDURSWi:
407     return AArch64::LDPSWi;
408   }
409 }
410 
411 static unsigned isMatchingStore(MachineInstr &LoadInst,
412                                 MachineInstr &StoreInst) {
413   unsigned LdOpc = LoadInst.getOpcode();
414   unsigned StOpc = StoreInst.getOpcode();
415   switch (LdOpc) {
416   default:
417     llvm_unreachable("Unsupported load instruction!");
418   case AArch64::LDRBBui:
419     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
420            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
421   case AArch64::LDURBBi:
422     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
423            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
424   case AArch64::LDRHHui:
425     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
426            StOpc == AArch64::STRXui;
427   case AArch64::LDURHHi:
428     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
429            StOpc == AArch64::STURXi;
430   case AArch64::LDRWui:
431     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
432   case AArch64::LDURWi:
433     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
434   case AArch64::LDRXui:
435     return StOpc == AArch64::STRXui;
436   case AArch64::LDURXi:
437     return StOpc == AArch64::STURXi;
438   }
439 }
440 
441 static unsigned getPreIndexedOpcode(unsigned Opc) {
442   switch (Opc) {
443   default:
444     llvm_unreachable("Opcode has no pre-indexed equivalent!");
445   case AArch64::STRSui:
446     return AArch64::STRSpre;
447   case AArch64::STRDui:
448     return AArch64::STRDpre;
449   case AArch64::STRQui:
450     return AArch64::STRQpre;
451   case AArch64::STRBBui:
452     return AArch64::STRBBpre;
453   case AArch64::STRHHui:
454     return AArch64::STRHHpre;
455   case AArch64::STRWui:
456     return AArch64::STRWpre;
457   case AArch64::STRXui:
458     return AArch64::STRXpre;
459   case AArch64::LDRSui:
460     return AArch64::LDRSpre;
461   case AArch64::LDRDui:
462     return AArch64::LDRDpre;
463   case AArch64::LDRQui:
464     return AArch64::LDRQpre;
465   case AArch64::LDRBBui:
466     return AArch64::LDRBBpre;
467   case AArch64::LDRHHui:
468     return AArch64::LDRHHpre;
469   case AArch64::LDRWui:
470     return AArch64::LDRWpre;
471   case AArch64::LDRXui:
472     return AArch64::LDRXpre;
473   case AArch64::LDRSWui:
474     return AArch64::LDRSWpre;
475   case AArch64::LDPSi:
476     return AArch64::LDPSpre;
477   case AArch64::LDPSWi:
478     return AArch64::LDPSWpre;
479   case AArch64::LDPDi:
480     return AArch64::LDPDpre;
481   case AArch64::LDPQi:
482     return AArch64::LDPQpre;
483   case AArch64::LDPWi:
484     return AArch64::LDPWpre;
485   case AArch64::LDPXi:
486     return AArch64::LDPXpre;
487   case AArch64::STPSi:
488     return AArch64::STPSpre;
489   case AArch64::STPDi:
490     return AArch64::STPDpre;
491   case AArch64::STPQi:
492     return AArch64::STPQpre;
493   case AArch64::STPWi:
494     return AArch64::STPWpre;
495   case AArch64::STPXi:
496     return AArch64::STPXpre;
497   }
498 }
499 
500 static unsigned getPostIndexedOpcode(unsigned Opc) {
501   switch (Opc) {
502   default:
503     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
504   case AArch64::STRSui:
505     return AArch64::STRSpost;
506   case AArch64::STRDui:
507     return AArch64::STRDpost;
508   case AArch64::STRQui:
509     return AArch64::STRQpost;
510   case AArch64::STRBBui:
511     return AArch64::STRBBpost;
512   case AArch64::STRHHui:
513     return AArch64::STRHHpost;
514   case AArch64::STRWui:
515     return AArch64::STRWpost;
516   case AArch64::STRXui:
517     return AArch64::STRXpost;
518   case AArch64::LDRSui:
519     return AArch64::LDRSpost;
520   case AArch64::LDRDui:
521     return AArch64::LDRDpost;
522   case AArch64::LDRQui:
523     return AArch64::LDRQpost;
524   case AArch64::LDRBBui:
525     return AArch64::LDRBBpost;
526   case AArch64::LDRHHui:
527     return AArch64::LDRHHpost;
528   case AArch64::LDRWui:
529     return AArch64::LDRWpost;
530   case AArch64::LDRXui:
531     return AArch64::LDRXpost;
532   case AArch64::LDRSWui:
533     return AArch64::LDRSWpost;
534   case AArch64::LDPSi:
535     return AArch64::LDPSpost;
536   case AArch64::LDPSWi:
537     return AArch64::LDPSWpost;
538   case AArch64::LDPDi:
539     return AArch64::LDPDpost;
540   case AArch64::LDPQi:
541     return AArch64::LDPQpost;
542   case AArch64::LDPWi:
543     return AArch64::LDPWpost;
544   case AArch64::LDPXi:
545     return AArch64::LDPXpost;
546   case AArch64::STPSi:
547     return AArch64::STPSpost;
548   case AArch64::STPDi:
549     return AArch64::STPDpost;
550   case AArch64::STPQi:
551     return AArch64::STPQpost;
552   case AArch64::STPWi:
553     return AArch64::STPWpost;
554   case AArch64::STPXi:
555     return AArch64::STPXpost;
556   }
557 }
558 
559 static bool isPairedLdSt(const MachineInstr &MI) {
560   switch (MI.getOpcode()) {
561   default:
562     return false;
563   case AArch64::LDPSi:
564   case AArch64::LDPSWi:
565   case AArch64::LDPDi:
566   case AArch64::LDPQi:
567   case AArch64::LDPWi:
568   case AArch64::LDPXi:
569   case AArch64::STPSi:
570   case AArch64::STPDi:
571   case AArch64::STPQi:
572   case AArch64::STPWi:
573   case AArch64::STPXi:
574     return true;
575   }
576 }
577 
578 static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
579                                           unsigned PairedRegOp = 0) {
580   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
581   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
582   return MI.getOperand(Idx);
583 }
584 
585 static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
586   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
587   return MI.getOperand(Idx);
588 }
589 
590 static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
591   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
592   return MI.getOperand(Idx);
593 }
594 
595 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
596                                   MachineInstr &StoreInst,
597                                   const AArch64InstrInfo *TII) {
598   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
599   int LoadSize = getMemScale(LoadInst);
600   int StoreSize = getMemScale(StoreInst);
601   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
602                              ? getLdStOffsetOp(StoreInst).getImm()
603                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
604   int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
605                              ? getLdStOffsetOp(LoadInst).getImm()
606                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
607   return (UnscaledStOffset <= UnscaledLdOffset) &&
608          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
609 }
610 
611 static bool isPromotableZeroStoreOpcode(unsigned Opc) {
612   return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi;
613 }
614 
615 static bool isPromotableZeroStoreOpcode(MachineInstr &MI) {
616   return isPromotableZeroStoreOpcode(MI.getOpcode());
617 }
618 
619 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
620   return (isPromotableZeroStoreOpcode(MI)) &&
621          getLdStRegOp(MI).getReg() == AArch64::WZR;
622 }
623 
624 MachineBasicBlock::iterator
625 AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
626                                       MachineBasicBlock::iterator MergeMI,
627                                       const LdStPairFlags &Flags) {
628   MachineBasicBlock::iterator NextI = I;
629   ++NextI;
630   // If NextI is the second of the two instructions to be merged, we need
631   // to skip one further. Either way we merge will invalidate the iterator,
632   // and we don't need to scan the new instruction, as it's a pairwise
633   // instruction, which we're not considering for further action anyway.
634   if (NextI == MergeMI)
635     ++NextI;
636 
637   unsigned Opc = I->getOpcode();
638   bool IsScaled = !TII->isUnscaledLdSt(Opc);
639   int OffsetStride = IsScaled ? 1 : getMemScale(*I);
640 
641   bool MergeForward = Flags.getMergeForward();
642   // Insert our new paired instruction after whichever of the paired
643   // instructions MergeForward indicates.
644   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
645   // Also based on MergeForward is from where we copy the base register operand
646   // so we get the flags compatible with the input code.
647   const MachineOperand &BaseRegOp =
648       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
649 
650   // Which register is Rt and which is Rt2 depends on the offset order.
651   MachineInstr *RtMI, *Rt2MI;
652   if (getLdStOffsetOp(*I).getImm() ==
653       getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) {
654     RtMI = &*MergeMI;
655     Rt2MI = &*I;
656   } else {
657     RtMI = &*I;
658     Rt2MI = &*MergeMI;
659   }
660 
661   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
662   // Change the scaled offset from small to large type.
663   if (IsScaled) {
664     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
665     OffsetImm /= 2;
666   }
667 
668   DebugLoc DL = I->getDebugLoc();
669   MachineBasicBlock *MBB = I->getParent();
670   if (isNarrowLoad(Opc)) {
671     MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI);
672     // When merging small (< 32 bit) loads for big-endian targets, the order of
673     // the component parts gets swapped.
674     if (!Subtarget->isLittleEndian())
675       std::swap(RtMI, Rt2MI);
676     // Construct the new load instruction.
677     MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
678     NewMemMI =
679         BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
680             .addOperand(getLdStRegOp(*RtNewDest))
681             .addOperand(BaseRegOp)
682             .addImm(OffsetImm)
683             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
684     (void)NewMemMI;
685 
686     DEBUG(
687         dbgs()
688         << "Creating the new load and extract. Replacing instructions:\n    ");
689     DEBUG(I->print(dbgs()));
690     DEBUG(dbgs() << "    ");
691     DEBUG(MergeMI->print(dbgs()));
692     DEBUG(dbgs() << "  with instructions:\n    ");
693     DEBUG((NewMemMI)->print(dbgs()));
694 
695     int Width = getMemScale(*I) == 1 ? 8 : 16;
696     int LSBLow = 0;
697     int LSBHigh = Width;
698     int ImmsLow = LSBLow + Width - 1;
699     int ImmsHigh = LSBHigh + Width - 1;
700     MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I);
701     if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
702       // Create the bitfield extract for high bits.
703       BitExtMI1 =
704           BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
705               .addOperand(getLdStRegOp(*Rt2MI))
706               .addReg(getLdStRegOp(*RtNewDest).getReg())
707               .addImm(LSBHigh)
708               .addImm(ImmsHigh);
709       // Create the bitfield extract for low bits.
710       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
711         // For unsigned, prefer to use AND for low bits.
712         BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
713                         .addOperand(getLdStRegOp(*RtMI))
714                         .addReg(getLdStRegOp(*RtNewDest).getReg())
715                         .addImm(ImmsLow);
716       } else {
717         BitExtMI2 =
718             BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
719                 .addOperand(getLdStRegOp(*RtMI))
720                 .addReg(getLdStRegOp(*RtNewDest).getReg())
721                 .addImm(LSBLow)
722                 .addImm(ImmsLow);
723       }
724     } else {
725       // Create the bitfield extract for low bits.
726       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
727         // For unsigned, prefer to use AND for low bits.
728         BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
729                         .addOperand(getLdStRegOp(*RtMI))
730                         .addReg(getLdStRegOp(*RtNewDest).getReg())
731                         .addImm(ImmsLow);
732       } else {
733         BitExtMI1 =
734             BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
735                 .addOperand(getLdStRegOp(*RtMI))
736                 .addReg(getLdStRegOp(*RtNewDest).getReg())
737                 .addImm(LSBLow)
738                 .addImm(ImmsLow);
739       }
740 
741       // Create the bitfield extract for high bits.
742       BitExtMI2 =
743           BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
744               .addOperand(getLdStRegOp(*Rt2MI))
745               .addReg(getLdStRegOp(*RtNewDest).getReg())
746               .addImm(LSBHigh)
747               .addImm(ImmsHigh);
748     }
749     (void)BitExtMI1;
750     (void)BitExtMI2;
751 
752     DEBUG(dbgs() << "    ");
753     DEBUG((BitExtMI1)->print(dbgs()));
754     DEBUG(dbgs() << "    ");
755     DEBUG((BitExtMI2)->print(dbgs()));
756     DEBUG(dbgs() << "\n");
757 
758     // Erase the old instructions.
759     I->eraseFromParent();
760     MergeMI->eraseFromParent();
761     return NextI;
762   }
763   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
764          "Expected promotable zero store");
765 
766   // Construct the new instruction.
767   MachineInstrBuilder MIB;
768   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
769             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
770             .addOperand(BaseRegOp)
771             .addImm(OffsetImm)
772             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
773   (void)MIB;
774 
775   DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n    ");
776   DEBUG(I->print(dbgs()));
777   DEBUG(dbgs() << "    ");
778   DEBUG(MergeMI->print(dbgs()));
779   DEBUG(dbgs() << "  with instruction:\n    ");
780   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
781   DEBUG(dbgs() << "\n");
782 
783   // Erase the old instructions.
784   I->eraseFromParent();
785   MergeMI->eraseFromParent();
786   return NextI;
787 }
788 
789 MachineBasicBlock::iterator
790 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
791                                       MachineBasicBlock::iterator Paired,
792                                       const LdStPairFlags &Flags) {
793   MachineBasicBlock::iterator NextI = I;
794   ++NextI;
795   // If NextI is the second of the two instructions to be merged, we need
796   // to skip one further. Either way we merge will invalidate the iterator,
797   // and we don't need to scan the new instruction, as it's a pairwise
798   // instruction, which we're not considering for further action anyway.
799   if (NextI == Paired)
800     ++NextI;
801 
802   int SExtIdx = Flags.getSExtIdx();
803   unsigned Opc =
804       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
805   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
806   int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
807 
808   bool MergeForward = Flags.getMergeForward();
809   // Insert our new paired instruction after whichever of the paired
810   // instructions MergeForward indicates.
811   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
812   // Also based on MergeForward is from where we copy the base register operand
813   // so we get the flags compatible with the input code.
814   const MachineOperand &BaseRegOp =
815       MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
816 
817   int Offset = getLdStOffsetOp(*I).getImm();
818   int PairedOffset = getLdStOffsetOp(*Paired).getImm();
819   bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
820   if (IsUnscaled != PairedIsUnscaled) {
821     // We're trying to pair instructions that differ in how they are scaled.  If
822     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
823     // the opposite (i.e., make Paired's offset unscaled).
824     int MemSize = getMemScale(*Paired);
825     if (PairedIsUnscaled) {
826       // If the unscaled offset isn't a multiple of the MemSize, we can't
827       // pair the operations together.
828       assert(!(PairedOffset % getMemScale(*Paired)) &&
829              "Offset should be a multiple of the stride!");
830       PairedOffset /= MemSize;
831     } else {
832       PairedOffset *= MemSize;
833     }
834   }
835 
836   // Which register is Rt and which is Rt2 depends on the offset order.
837   MachineInstr *RtMI, *Rt2MI;
838   if (Offset == PairedOffset + OffsetStride) {
839     RtMI = &*Paired;
840     Rt2MI = &*I;
841     // Here we swapped the assumption made for SExtIdx.
842     // I.e., we turn ldp I, Paired into ldp Paired, I.
843     // Update the index accordingly.
844     if (SExtIdx != -1)
845       SExtIdx = (SExtIdx + 1) % 2;
846   } else {
847     RtMI = &*I;
848     Rt2MI = &*Paired;
849   }
850   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
851   // Scale the immediate offset, if necessary.
852   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
853     assert(!(OffsetImm % getMemScale(*RtMI)) &&
854            "Unscaled offset cannot be scaled.");
855     OffsetImm /= getMemScale(*RtMI);
856   }
857 
858   // Construct the new instruction.
859   MachineInstrBuilder MIB;
860   DebugLoc DL = I->getDebugLoc();
861   MachineBasicBlock *MBB = I->getParent();
862   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
863             .addOperand(getLdStRegOp(*RtMI))
864             .addOperand(getLdStRegOp(*Rt2MI))
865             .addOperand(BaseRegOp)
866             .addImm(OffsetImm)
867             .setMemRefs(I->mergeMemRefsWith(*Paired));
868 
869   (void)MIB;
870 
871   DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
872   DEBUG(I->print(dbgs()));
873   DEBUG(dbgs() << "    ");
874   DEBUG(Paired->print(dbgs()));
875   DEBUG(dbgs() << "  with instruction:\n    ");
876   if (SExtIdx != -1) {
877     // Generate the sign extension for the proper result of the ldp.
878     // I.e., with X1, that would be:
879     // %W1<def> = KILL %W1, %X1<imp-def>
880     // %X1<def> = SBFMXri %X1<kill>, 0, 31
881     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
882     // Right now, DstMO has the extended register, since it comes from an
883     // extended opcode.
884     unsigned DstRegX = DstMO.getReg();
885     // Get the W variant of that register.
886     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
887     // Update the result of LDP to use the W instead of the X variant.
888     DstMO.setReg(DstRegW);
889     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
890     DEBUG(dbgs() << "\n");
891     // Make the machine verifier happy by providing a definition for
892     // the X register.
893     // Insert this definition right after the generated LDP, i.e., before
894     // InsertionPoint.
895     MachineInstrBuilder MIBKill =
896         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
897             .addReg(DstRegW)
898             .addReg(DstRegX, RegState::Define);
899     MIBKill->getOperand(2).setImplicit();
900     // Create the sign extension.
901     MachineInstrBuilder MIBSXTW =
902         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
903             .addReg(DstRegX)
904             .addImm(0)
905             .addImm(31);
906     (void)MIBSXTW;
907     DEBUG(dbgs() << "  Extend operand:\n    ");
908     DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
909   } else {
910     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
911   }
912   DEBUG(dbgs() << "\n");
913 
914   // Erase the old instructions.
915   I->eraseFromParent();
916   Paired->eraseFromParent();
917 
918   return NextI;
919 }
920 
921 MachineBasicBlock::iterator
922 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
923                                           MachineBasicBlock::iterator StoreI) {
924   MachineBasicBlock::iterator NextI = LoadI;
925   ++NextI;
926 
927   int LoadSize = getMemScale(*LoadI);
928   int StoreSize = getMemScale(*StoreI);
929   unsigned LdRt = getLdStRegOp(*LoadI).getReg();
930   unsigned StRt = getLdStRegOp(*StoreI).getReg();
931   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
932 
933   assert((IsStoreXReg ||
934           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
935          "Unexpected RegClass");
936 
937   MachineInstr *BitExtMI;
938   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
939     // Remove the load, if the destination register of the loads is the same
940     // register for stored value.
941     if (StRt == LdRt && LoadSize == 8) {
942       DEBUG(dbgs() << "Remove load instruction:\n    ");
943       DEBUG(LoadI->print(dbgs()));
944       DEBUG(dbgs() << "\n");
945       LoadI->eraseFromParent();
946       return NextI;
947     }
948     // Replace the load with a mov if the load and store are in the same size.
949     BitExtMI =
950         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
951                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
952             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
953             .addReg(StRt)
954             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
955   } else {
956     // FIXME: Currently we disable this transformation in big-endian targets as
957     // performance and correctness are verified only in little-endian.
958     if (!Subtarget->isLittleEndian())
959       return NextI;
960     bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
961     assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
962            "Unsupported ld/st match");
963     assert(LoadSize <= StoreSize && "Invalid load size");
964     int UnscaledLdOffset = IsUnscaled
965                                ? getLdStOffsetOp(*LoadI).getImm()
966                                : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
967     int UnscaledStOffset = IsUnscaled
968                                ? getLdStOffsetOp(*StoreI).getImm()
969                                : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
970     int Width = LoadSize * 8;
971     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
972     int Imms = Immr + Width - 1;
973     unsigned DestReg = IsStoreXReg
974                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
975                                                       &AArch64::GPR64RegClass)
976                            : LdRt;
977 
978     assert((UnscaledLdOffset >= UnscaledStOffset &&
979             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
980            "Invalid offset");
981 
982     Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
983     Imms = Immr + Width - 1;
984     if (UnscaledLdOffset == UnscaledStOffset) {
985       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
986                                 | ((Immr) << 6)               // immr
987                                 | ((Imms) << 0)               // imms
988           ;
989 
990       BitExtMI =
991           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
992                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
993                   DestReg)
994               .addReg(StRt)
995               .addImm(AndMaskEncoded);
996     } else {
997       BitExtMI =
998           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
999                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1000                   DestReg)
1001               .addReg(StRt)
1002               .addImm(Immr)
1003               .addImm(Imms);
1004     }
1005   }
1006   (void)BitExtMI;
1007 
1008   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
1009   DEBUG(StoreI->print(dbgs()));
1010   DEBUG(dbgs() << "    ");
1011   DEBUG(LoadI->print(dbgs()));
1012   DEBUG(dbgs() << "  with instructions:\n    ");
1013   DEBUG(StoreI->print(dbgs()));
1014   DEBUG(dbgs() << "    ");
1015   DEBUG((BitExtMI)->print(dbgs()));
1016   DEBUG(dbgs() << "\n");
1017 
1018   // Erase the old instructions.
1019   LoadI->eraseFromParent();
1020   return NextI;
1021 }
1022 
1023 /// trackRegDefsUses - Remember what registers the specified instruction uses
1024 /// and modifies.
1025 static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
1026                              BitVector &UsedRegs,
1027                              const TargetRegisterInfo *TRI) {
1028   for (const MachineOperand &MO : MI.operands()) {
1029     if (MO.isRegMask())
1030       ModifiedRegs.setBitsNotInMask(MO.getRegMask());
1031 
1032     if (!MO.isReg())
1033       continue;
1034     unsigned Reg = MO.getReg();
1035     if (!Reg)
1036       continue;
1037     if (MO.isDef()) {
1038       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
1039         ModifiedRegs.set(*AI);
1040     } else {
1041       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
1042       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
1043         UsedRegs.set(*AI);
1044     }
1045   }
1046 }
1047 
1048 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1049   // Convert the byte-offset used by unscaled into an "element" offset used
1050   // by the scaled pair load/store instructions.
1051   if (IsUnscaled) {
1052     // If the byte-offset isn't a multiple of the stride, there's no point
1053     // trying to match it.
1054     if (Offset % OffsetStride)
1055       return false;
1056     Offset /= OffsetStride;
1057   }
1058   return Offset <= 63 && Offset >= -64;
1059 }
1060 
1061 // Do alignment, specialized to power of 2 and for signed ints,
1062 // avoiding having to do a C-style cast from uint_64t to int when
1063 // using alignTo from include/llvm/Support/MathExtras.h.
1064 // FIXME: Move this function to include/MathExtras.h?
1065 static int alignTo(int Num, int PowOf2) {
1066   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1067 }
1068 
1069 static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
1070                      const AArch64InstrInfo *TII) {
1071   // One of the instructions must modify memory.
1072   if (!MIa.mayStore() && !MIb.mayStore())
1073     return false;
1074 
1075   // Both instructions must be memory operations.
1076   if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
1077     return false;
1078 
1079   return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
1080 }
1081 
1082 static bool mayAlias(MachineInstr &MIa,
1083                      SmallVectorImpl<MachineInstr *> &MemInsns,
1084                      const AArch64InstrInfo *TII) {
1085   for (MachineInstr *MIb : MemInsns)
1086     if (mayAlias(MIa, *MIb, TII))
1087       return true;
1088 
1089   return false;
1090 }
1091 
1092 bool AArch64LoadStoreOpt::findMatchingStore(
1093     MachineBasicBlock::iterator I, unsigned Limit,
1094     MachineBasicBlock::iterator &StoreI) {
1095   MachineBasicBlock::iterator B = I->getParent()->begin();
1096   MachineBasicBlock::iterator MBBI = I;
1097   MachineInstr &LoadMI = *I;
1098   unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
1099 
1100   // If the load is the first instruction in the block, there's obviously
1101   // not any matching store.
1102   if (MBBI == B)
1103     return false;
1104 
1105   // Track which registers have been modified and used between the first insn
1106   // and the second insn.
1107   ModifiedRegs.reset();
1108   UsedRegs.reset();
1109 
1110   unsigned Count = 0;
1111   do {
1112     --MBBI;
1113     MachineInstr &MI = *MBBI;
1114 
1115     // Don't count transient instructions towards the search limit since there
1116     // may be different numbers of them if e.g. debug information is present.
1117     if (!MI.isTransient())
1118       ++Count;
1119 
1120     // If the load instruction reads directly from the address to which the
1121     // store instruction writes and the stored value is not modified, we can
1122     // promote the load. Since we do not handle stores with pre-/post-index,
1123     // it's unnecessary to check if BaseReg is modified by the store itself.
1124     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1125         BaseReg == getLdStBaseOp(MI).getReg() &&
1126         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1127         !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
1128       StoreI = MBBI;
1129       return true;
1130     }
1131 
1132     if (MI.isCall())
1133       return false;
1134 
1135     // Update modified / uses register lists.
1136     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1137 
1138     // Otherwise, if the base register is modified, we have no match, so
1139     // return early.
1140     if (ModifiedRegs[BaseReg])
1141       return false;
1142 
1143     // If we encounter a store aliased with the load, return early.
1144     if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
1145       return false;
1146   } while (MBBI != B && Count < Limit);
1147   return false;
1148 }
1149 
1150 // Returns true if FirstMI and MI are candidates for merging or pairing.
1151 // Otherwise, returns false.
1152 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1153                                        LdStPairFlags &Flags,
1154                                        const AArch64InstrInfo *TII) {
1155   // If this is volatile or if pairing is suppressed, not a candidate.
1156   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1157     return false;
1158 
1159   // We should have already checked FirstMI for pair suppression and volatility.
1160   assert(!FirstMI.hasOrderedMemoryRef() &&
1161          !TII->isLdStPairSuppressed(FirstMI) &&
1162          "FirstMI shouldn't get here if either of these checks are true.");
1163 
1164   unsigned OpcA = FirstMI.getOpcode();
1165   unsigned OpcB = MI.getOpcode();
1166 
1167   // Opcodes match: nothing more to check.
1168   if (OpcA == OpcB)
1169     return true;
1170 
1171   // Try to match a sign-extended load/store with a zero-extended load/store.
1172   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1173   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1174   assert(IsValidLdStrOpc &&
1175          "Given Opc should be a Load or Store with an immediate");
1176   // OpcA will be the first instruction in the pair.
1177   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1178     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1179     return true;
1180   }
1181 
1182   // If the second instruction isn't even a load/store, bail out.
1183   if (!PairIsValidLdStrOpc)
1184     return false;
1185 
1186   // FIXME: We don't support merging narrow loads/stores with mixed
1187   // scaled/unscaled offsets.
1188   if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB))
1189     return false;
1190 
1191   // Try to match an unscaled load/store with a scaled load/store.
1192   return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
1193          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1194 
1195   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1196 }
1197 
1198 /// Scan the instructions looking for a load/store that can be combined with the
1199 /// current instruction into a wider equivalent or a load/store pair.
1200 MachineBasicBlock::iterator
1201 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1202                                       LdStPairFlags &Flags, unsigned Limit,
1203                                       bool FindNarrowMerge) {
1204   MachineBasicBlock::iterator E = I->getParent()->end();
1205   MachineBasicBlock::iterator MBBI = I;
1206   MachineInstr &FirstMI = *I;
1207   ++MBBI;
1208 
1209   bool MayLoad = FirstMI.mayLoad();
1210   bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
1211   unsigned Reg = getLdStRegOp(FirstMI).getReg();
1212   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1213   int Offset = getLdStOffsetOp(FirstMI).getImm();
1214   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
1215   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1216 
1217   // Track which registers have been modified and used between the first insn
1218   // (inclusive) and the second insn.
1219   ModifiedRegs.reset();
1220   UsedRegs.reset();
1221 
1222   // Remember any instructions that read/write memory between FirstMI and MI.
1223   SmallVector<MachineInstr *, 4> MemInsns;
1224 
1225   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1226     MachineInstr &MI = *MBBI;
1227 
1228     // Don't count transient instructions towards the search limit since there
1229     // may be different numbers of them if e.g. debug information is present.
1230     if (!MI.isTransient())
1231       ++Count;
1232 
1233     Flags.setSExtIdx(-1);
1234     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1235         getLdStOffsetOp(MI).isImm()) {
1236       assert(MI.mayLoadOrStore() && "Expected memory operation.");
1237       // If we've found another instruction with the same opcode, check to see
1238       // if the base and offset are compatible with our starting instruction.
1239       // These instructions all have scaled immediate operands, so we just
1240       // check for +1/-1. Make sure to check the new instruction offset is
1241       // actually an immediate and not a symbolic reference destined for
1242       // a relocation.
1243       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
1244       int MIOffset = getLdStOffsetOp(MI).getImm();
1245       bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
1246       if (IsUnscaled != MIIsUnscaled) {
1247         // We're trying to pair instructions that differ in how they are scaled.
1248         // If FirstMI is scaled then scale the offset of MI accordingly.
1249         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1250         int MemSize = getMemScale(MI);
1251         if (MIIsUnscaled) {
1252           // If the unscaled offset isn't a multiple of the MemSize, we can't
1253           // pair the operations together: bail and keep looking.
1254           if (MIOffset % MemSize) {
1255             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1256             MemInsns.push_back(&MI);
1257             continue;
1258           }
1259           MIOffset /= MemSize;
1260         } else {
1261           MIOffset *= MemSize;
1262         }
1263       }
1264 
1265       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1266                                    (Offset + OffsetStride == MIOffset))) {
1267         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1268         if (FindNarrowMerge) {
1269           // If the alignment requirements of the scaled wide load/store
1270           // instruction can't express the offset of the scaled narrow input,
1271           // bail and keep looking. For promotable zero stores, allow only when
1272           // the stored value is the same (i.e., WZR).
1273           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1274               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1275             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1276             MemInsns.push_back(&MI);
1277             continue;
1278           }
1279         } else {
1280           // Pairwise instructions have a 7-bit signed offset field. Single
1281           // insns have a 12-bit unsigned offset field.  If the resultant
1282           // immediate offset of merging these instructions is out of range for
1283           // a pairwise instruction, bail and keep looking.
1284           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1285             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1286             MemInsns.push_back(&MI);
1287             continue;
1288           }
1289           // If the alignment requirements of the paired (scaled) instruction
1290           // can't express the offset of the unscaled input, bail and keep
1291           // looking.
1292           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1293             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1294             MemInsns.push_back(&MI);
1295             continue;
1296           }
1297         }
1298         // If the destination register of the loads is the same register, bail
1299         // and keep looking. A load-pair instruction with both destination
1300         // registers the same is UNPREDICTABLE and will result in an exception.
1301         if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
1302           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1303           MemInsns.push_back(&MI);
1304           continue;
1305         }
1306 
1307         // If the Rt of the second instruction was not modified or used between
1308         // the two instructions and none of the instructions between the second
1309         // and first alias with the second, we can combine the second into the
1310         // first.
1311         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
1312             !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
1313             !mayAlias(MI, MemInsns, TII)) {
1314           Flags.setMergeForward(false);
1315           return MBBI;
1316         }
1317 
1318         // Likewise, if the Rt of the first instruction is not modified or used
1319         // between the two instructions and none of the instructions between the
1320         // first and the second alias with the first, we can combine the first
1321         // into the second.
1322         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
1323             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
1324             !mayAlias(FirstMI, MemInsns, TII)) {
1325           Flags.setMergeForward(true);
1326           return MBBI;
1327         }
1328         // Unable to combine these instructions due to interference in between.
1329         // Keep looking.
1330       }
1331     }
1332 
1333     // If the instruction wasn't a matching load or store.  Stop searching if we
1334     // encounter a call instruction that might modify memory.
1335     if (MI.isCall())
1336       return E;
1337 
1338     // Update modified / uses register lists.
1339     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1340 
1341     // Otherwise, if the base register is modified, we have no match, so
1342     // return early.
1343     if (ModifiedRegs[BaseReg])
1344       return E;
1345 
1346     // Update list of instructions that read/write memory.
1347     if (MI.mayLoadOrStore())
1348       MemInsns.push_back(&MI);
1349   }
1350   return E;
1351 }
1352 
1353 MachineBasicBlock::iterator
1354 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1355                                      MachineBasicBlock::iterator Update,
1356                                      bool IsPreIdx) {
1357   assert((Update->getOpcode() == AArch64::ADDXri ||
1358           Update->getOpcode() == AArch64::SUBXri) &&
1359          "Unexpected base register update instruction to merge!");
1360   MachineBasicBlock::iterator NextI = I;
1361   // Return the instruction following the merged instruction, which is
1362   // the instruction following our unmerged load. Unless that's the add/sub
1363   // instruction we're merging, in which case it's the one after that.
1364   if (++NextI == Update)
1365     ++NextI;
1366 
1367   int Value = Update->getOperand(2).getImm();
1368   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1369          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1370   if (Update->getOpcode() == AArch64::SUBXri)
1371     Value = -Value;
1372 
1373   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1374                              : getPostIndexedOpcode(I->getOpcode());
1375   MachineInstrBuilder MIB;
1376   if (!isPairedLdSt(*I)) {
1377     // Non-paired instruction.
1378     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1379               .addOperand(getLdStRegOp(*Update))
1380               .addOperand(getLdStRegOp(*I))
1381               .addOperand(getLdStBaseOp(*I))
1382               .addImm(Value)
1383               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1384   } else {
1385     // Paired instruction.
1386     int Scale = getMemScale(*I);
1387     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1388               .addOperand(getLdStRegOp(*Update))
1389               .addOperand(getLdStRegOp(*I, 0))
1390               .addOperand(getLdStRegOp(*I, 1))
1391               .addOperand(getLdStBaseOp(*I))
1392               .addImm(Value / Scale)
1393               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1394   }
1395   (void)MIB;
1396 
1397   if (IsPreIdx)
1398     DEBUG(dbgs() << "Creating pre-indexed load/store.");
1399   else
1400     DEBUG(dbgs() << "Creating post-indexed load/store.");
1401   DEBUG(dbgs() << "    Replacing instructions:\n    ");
1402   DEBUG(I->print(dbgs()));
1403   DEBUG(dbgs() << "    ");
1404   DEBUG(Update->print(dbgs()));
1405   DEBUG(dbgs() << "  with instruction:\n    ");
1406   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1407   DEBUG(dbgs() << "\n");
1408 
1409   // Erase the old instructions for the block.
1410   I->eraseFromParent();
1411   Update->eraseFromParent();
1412 
1413   return NextI;
1414 }
1415 
1416 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1417                                                MachineInstr &MI,
1418                                                unsigned BaseReg, int Offset) {
1419   switch (MI.getOpcode()) {
1420   default:
1421     break;
1422   case AArch64::SUBXri:
1423   case AArch64::ADDXri:
1424     // Make sure it's a vanilla immediate operand, not a relocation or
1425     // anything else we can't handle.
1426     if (!MI.getOperand(2).isImm())
1427       break;
1428     // Watch out for 1 << 12 shifted value.
1429     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1430       break;
1431 
1432     // The update instruction source and destination register must be the
1433     // same as the load/store base register.
1434     if (MI.getOperand(0).getReg() != BaseReg ||
1435         MI.getOperand(1).getReg() != BaseReg)
1436       break;
1437 
1438     bool IsPairedInsn = isPairedLdSt(MemMI);
1439     int UpdateOffset = MI.getOperand(2).getImm();
1440     if (MI.getOpcode() == AArch64::SUBXri)
1441       UpdateOffset = -UpdateOffset;
1442 
1443     // For non-paired load/store instructions, the immediate must fit in a
1444     // signed 9-bit integer.
1445     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1446       break;
1447 
1448     // For paired load/store instructions, the immediate must be a multiple of
1449     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
1450     // integer.
1451     if (IsPairedInsn) {
1452       int Scale = getMemScale(MemMI);
1453       if (UpdateOffset % Scale != 0)
1454         break;
1455 
1456       int ScaledOffset = UpdateOffset / Scale;
1457       if (ScaledOffset > 63 || ScaledOffset < -64)
1458         break;
1459     }
1460 
1461     // If we have a non-zero Offset, we check that it matches the amount
1462     // we're adding to the register.
1463     if (!Offset || Offset == UpdateOffset)
1464       return true;
1465     break;
1466   }
1467   return false;
1468 }
1469 
1470 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1471     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1472   MachineBasicBlock::iterator E = I->getParent()->end();
1473   MachineInstr &MemMI = *I;
1474   MachineBasicBlock::iterator MBBI = I;
1475 
1476   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1477   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1478 
1479   // Scan forward looking for post-index opportunities.  Updating instructions
1480   // can't be formed if the memory instruction doesn't have the offset we're
1481   // looking for.
1482   if (MIUnscaledOffset != UnscaledOffset)
1483     return E;
1484 
1485   // If the base register overlaps a destination register, we can't
1486   // merge the update.
1487   bool IsPairedInsn = isPairedLdSt(MemMI);
1488   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1489     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1490     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1491       return E;
1492   }
1493 
1494   // Track which registers have been modified and used between the first insn
1495   // (inclusive) and the second insn.
1496   ModifiedRegs.reset();
1497   UsedRegs.reset();
1498   ++MBBI;
1499   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1500     MachineInstr &MI = *MBBI;
1501 
1502     // Don't count transient instructions towards the search limit since there
1503     // may be different numbers of them if e.g. debug information is present.
1504     if (!MI.isTransient())
1505       ++Count;
1506 
1507     // If we found a match, return it.
1508     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1509       return MBBI;
1510 
1511     // Update the status of what the instruction clobbered and used.
1512     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1513 
1514     // Otherwise, if the base register is used or modified, we have no match, so
1515     // return early.
1516     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1517       return E;
1518   }
1519   return E;
1520 }
1521 
1522 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1523     MachineBasicBlock::iterator I, unsigned Limit) {
1524   MachineBasicBlock::iterator B = I->getParent()->begin();
1525   MachineBasicBlock::iterator E = I->getParent()->end();
1526   MachineInstr &MemMI = *I;
1527   MachineBasicBlock::iterator MBBI = I;
1528 
1529   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1530   int Offset = getLdStOffsetOp(MemMI).getImm();
1531 
1532   // If the load/store is the first instruction in the block, there's obviously
1533   // not any matching update. Ditto if the memory offset isn't zero.
1534   if (MBBI == B || Offset != 0)
1535     return E;
1536   // If the base register overlaps a destination register, we can't
1537   // merge the update.
1538   bool IsPairedInsn = isPairedLdSt(MemMI);
1539   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1540     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1541     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1542       return E;
1543   }
1544 
1545   // Track which registers have been modified and used between the first insn
1546   // (inclusive) and the second insn.
1547   ModifiedRegs.reset();
1548   UsedRegs.reset();
1549   unsigned Count = 0;
1550   do {
1551     --MBBI;
1552     MachineInstr &MI = *MBBI;
1553 
1554     // Don't count transient instructions towards the search limit since there
1555     // may be different numbers of them if e.g. debug information is present.
1556     if (!MI.isTransient())
1557       ++Count;
1558 
1559     // If we found a match, return it.
1560     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
1561       return MBBI;
1562 
1563     // Update the status of what the instruction clobbered and used.
1564     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1565 
1566     // Otherwise, if the base register is used or modified, we have no match, so
1567     // return early.
1568     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1569       return E;
1570   } while (MBBI != B && Count < Limit);
1571   return E;
1572 }
1573 
1574 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1575     MachineBasicBlock::iterator &MBBI) {
1576   MachineInstr &MI = *MBBI;
1577   // If this is a volatile load, don't mess with it.
1578   if (MI.hasOrderedMemoryRef())
1579     return false;
1580 
1581   // Make sure this is a reg+imm.
1582   // FIXME: It is possible to extend it to handle reg+reg cases.
1583   if (!getLdStOffsetOp(MI).isImm())
1584     return false;
1585 
1586   // Look backward up to LdStLimit instructions.
1587   MachineBasicBlock::iterator StoreI;
1588   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
1589     ++NumLoadsFromStoresPromoted;
1590     // Promote the load. Keeping the iterator straight is a
1591     // pain, so we let the merge routine tell us what the next instruction
1592     // is after it's done mucking about.
1593     MBBI = promoteLoadFromStore(MBBI, StoreI);
1594     return true;
1595   }
1596   return false;
1597 }
1598 
1599 // Find narrow loads that can be converted into a single wider load with
1600 // bitfield extract instructions.  Also merge adjacent zero stores into a wider
1601 // store.
1602 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
1603     MachineBasicBlock::iterator &MBBI) {
1604   assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) &&
1605          "Expected narrow op.");
1606   MachineInstr &MI = *MBBI;
1607   MachineBasicBlock::iterator E = MI.getParent()->end();
1608 
1609   if (!TII->isCandidateToMergeOrPair(MI))
1610     return false;
1611 
1612   // For promotable zero stores, the stored value should be WZR.
1613   if (isPromotableZeroStoreOpcode(MI) &&
1614       getLdStRegOp(MI).getReg() != AArch64::WZR)
1615     return false;
1616 
1617   // Look ahead up to LdStLimit instructions for a mergable instruction.
1618   LdStPairFlags Flags;
1619   MachineBasicBlock::iterator MergeMI =
1620       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
1621   if (MergeMI != E) {
1622     if (isNarrowLoad(MI)) {
1623       ++NumNarrowLoadsPromoted;
1624     } else if (isPromotableZeroStoreInst(MI)) {
1625       ++NumZeroStoresPromoted;
1626     }
1627     // Keeping the iterator straight is a pain, so we let the merge routine tell
1628     // us what the next instruction is after it's done mucking about.
1629     MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags);
1630     return true;
1631   }
1632   return false;
1633 }
1634 
1635 // Find loads and stores that can be merged into a single load or store pair
1636 // instruction.
1637 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
1638   MachineInstr &MI = *MBBI;
1639   MachineBasicBlock::iterator E = MI.getParent()->end();
1640 
1641   if (!TII->isCandidateToMergeOrPair(MI))
1642     return false;
1643 
1644   // Early exit if the offset is not possible to match. (6 bits of positive
1645   // range, plus allow an extra one in case we find a later insn that matches
1646   // with Offset-1)
1647   bool IsUnscaled = TII->isUnscaledLdSt(MI);
1648   int Offset = getLdStOffsetOp(MI).getImm();
1649   int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
1650   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1651     return false;
1652 
1653   // Look ahead up to LdStLimit instructions for a pairable instruction.
1654   LdStPairFlags Flags;
1655   MachineBasicBlock::iterator Paired =
1656       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
1657   if (Paired != E) {
1658     ++NumPairCreated;
1659     if (TII->isUnscaledLdSt(MI))
1660       ++NumUnscaledPairCreated;
1661     // Keeping the iterator straight is a pain, so we let the merge routine tell
1662     // us what the next instruction is after it's done mucking about.
1663     MBBI = mergePairedInsns(MBBI, Paired, Flags);
1664     return true;
1665   }
1666   return false;
1667 }
1668 
1669 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1670                                         bool enableNarrowLdOpt) {
1671   bool Modified = false;
1672   // Four tranformations to do here:
1673   // 1) Find loads that directly read from stores and promote them by
1674   //    replacing with mov instructions. If the store is wider than the load,
1675   //    the load will be replaced with a bitfield extract.
1676   //      e.g.,
1677   //        str w1, [x0, #4]
1678   //        ldrh w2, [x0, #6]
1679   //        ; becomes
1680   //        str w1, [x0, #4]
1681   //        lsr w2, w1, #16
1682   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1683        MBBI != E;) {
1684     MachineInstr &MI = *MBBI;
1685     switch (MI.getOpcode()) {
1686     default:
1687       // Just move on to the next instruction.
1688       ++MBBI;
1689       break;
1690     // Scaled instructions.
1691     case AArch64::LDRBBui:
1692     case AArch64::LDRHHui:
1693     case AArch64::LDRWui:
1694     case AArch64::LDRXui:
1695     // Unscaled instructions.
1696     case AArch64::LDURBBi:
1697     case AArch64::LDURHHi:
1698     case AArch64::LDURWi:
1699     case AArch64::LDURXi: {
1700       if (tryToPromoteLoadFromStore(MBBI)) {
1701         Modified = true;
1702         break;
1703       }
1704       ++MBBI;
1705       break;
1706     }
1707     }
1708   }
1709   // 2) Find narrow loads that can be converted into a single wider load
1710   //    with bitfield extract instructions.
1711   //      e.g.,
1712   //        ldrh w0, [x2]
1713   //        ldrh w1, [x2, #2]
1714   //        ; becomes
1715   //        ldr w0, [x2]
1716   //        ubfx w1, w0, #16, #16
1717   //        and w0, w0, #ffff
1718   //
1719   //    Also merge adjacent zero stores into a wider store.
1720   //      e.g.,
1721   //        strh wzr, [x0]
1722   //        strh wzr, [x0, #2]
1723   //        ; becomes
1724   //        str wzr, [x0]
1725   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1726        enableNarrowLdOpt && MBBI != E;) {
1727     MachineInstr &MI = *MBBI;
1728     unsigned Opc = MI.getOpcode();
1729     if (isPromotableZeroStoreOpcode(Opc) ||
1730         (EnableNarrowLdMerge && isNarrowLoad(Opc))) {
1731       if (tryToMergeLdStInst(MBBI)) {
1732         Modified = true;
1733       } else
1734         ++MBBI;
1735     } else
1736       ++MBBI;
1737   }
1738 
1739   // 3) Find loads and stores that can be merged into a single load or store
1740   //    pair instruction.
1741   //      e.g.,
1742   //        ldr x0, [x2]
1743   //        ldr x1, [x2, #8]
1744   //        ; becomes
1745   //        ldp x0, x1, [x2]
1746   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1747        MBBI != E;) {
1748     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
1749       Modified = true;
1750     else
1751       ++MBBI;
1752   }
1753   // 4) Find base register updates that can be merged into the load or store
1754   //    as a base-reg writeback.
1755   //      e.g.,
1756   //        ldr x0, [x2]
1757   //        add x2, x2, #4
1758   //        ; becomes
1759   //        ldr x0, [x2], #4
1760   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1761        MBBI != E;) {
1762     MachineInstr &MI = *MBBI;
1763     // Do update merging. It's simpler to keep this separate from the above
1764     // switchs, though not strictly necessary.
1765     unsigned Opc = MI.getOpcode();
1766     switch (Opc) {
1767     default:
1768       // Just move on to the next instruction.
1769       ++MBBI;
1770       break;
1771     // Scaled instructions.
1772     case AArch64::STRSui:
1773     case AArch64::STRDui:
1774     case AArch64::STRQui:
1775     case AArch64::STRXui:
1776     case AArch64::STRWui:
1777     case AArch64::STRHHui:
1778     case AArch64::STRBBui:
1779     case AArch64::LDRSui:
1780     case AArch64::LDRDui:
1781     case AArch64::LDRQui:
1782     case AArch64::LDRXui:
1783     case AArch64::LDRWui:
1784     case AArch64::LDRHHui:
1785     case AArch64::LDRBBui:
1786     // Unscaled instructions.
1787     case AArch64::STURSi:
1788     case AArch64::STURDi:
1789     case AArch64::STURQi:
1790     case AArch64::STURWi:
1791     case AArch64::STURXi:
1792     case AArch64::LDURSi:
1793     case AArch64::LDURDi:
1794     case AArch64::LDURQi:
1795     case AArch64::LDURWi:
1796     case AArch64::LDURXi:
1797     // Paired instructions.
1798     case AArch64::LDPSi:
1799     case AArch64::LDPSWi:
1800     case AArch64::LDPDi:
1801     case AArch64::LDPQi:
1802     case AArch64::LDPWi:
1803     case AArch64::LDPXi:
1804     case AArch64::STPSi:
1805     case AArch64::STPDi:
1806     case AArch64::STPQi:
1807     case AArch64::STPWi:
1808     case AArch64::STPXi: {
1809       // Make sure this is a reg+imm (as opposed to an address reloc).
1810       if (!getLdStOffsetOp(MI).isImm()) {
1811         ++MBBI;
1812         break;
1813       }
1814       // Look forward to try to form a post-index instruction. For example,
1815       // ldr x0, [x20]
1816       // add x20, x20, #32
1817       //   merged into:
1818       // ldr x0, [x20], #32
1819       MachineBasicBlock::iterator Update =
1820           findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
1821       if (Update != E) {
1822         // Merge the update into the ld/st.
1823         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1824         Modified = true;
1825         ++NumPostFolded;
1826         break;
1827       }
1828       // Don't know how to handle pre/post-index versions, so move to the next
1829       // instruction.
1830       if (TII->isUnscaledLdSt(Opc)) {
1831         ++MBBI;
1832         break;
1833       }
1834 
1835       // Look back to try to find a pre-index instruction. For example,
1836       // add x0, x0, #8
1837       // ldr x1, [x0]
1838       //   merged into:
1839       // ldr x1, [x0, #8]!
1840       Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
1841       if (Update != E) {
1842         // Merge the update into the ld/st.
1843         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1844         Modified = true;
1845         ++NumPreFolded;
1846         break;
1847       }
1848       // The immediate in the load/store is scaled by the size of the memory
1849       // operation. The immediate in the add we're looking for,
1850       // however, is not, so adjust here.
1851       int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1852 
1853       // Look forward to try to find a post-index instruction. For example,
1854       // ldr x1, [x0, #64]
1855       // add x0, x0, #64
1856       //   merged into:
1857       // ldr x1, [x0, #64]!
1858       Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
1859       if (Update != E) {
1860         // Merge the update into the ld/st.
1861         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1862         Modified = true;
1863         ++NumPreFolded;
1864         break;
1865       }
1866 
1867       // Nothing found. Just move to the next instruction.
1868       ++MBBI;
1869       break;
1870     }
1871     }
1872   }
1873 
1874   return Modified;
1875 }
1876 
1877 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1878   if (skipFunction(*Fn.getFunction()))
1879     return false;
1880 
1881   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1882   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1883   TRI = Subtarget->getRegisterInfo();
1884 
1885   // Resize the modified and used register bitfield trackers.  We do this once
1886   // per function and then clear the bitfield each time we optimize a load or
1887   // store.
1888   ModifiedRegs.resize(TRI->getNumRegs());
1889   UsedRegs.resize(TRI->getNumRegs());
1890 
1891   bool Modified = false;
1892   bool enableNarrowLdOpt =
1893     Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
1894   for (auto &MBB : Fn)
1895     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
1896 
1897   return Modified;
1898 }
1899 
1900 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
1901 // loads and stores near one another?
1902 
1903 // FIXME: When pairing store instructions it's very possible for this pass to
1904 // hoist a store with a KILL marker above another use (without a KILL marker).
1905 // The resulting IR is invalid, but nothing uses the KILL markers after this
1906 // pass, so it's never caused a problem in practice.
1907 
1908 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1909 /// load / store optimization pass.
1910 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1911   return new AArch64LoadStoreOpt();
1912 }
1913