1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
32 using namespace llvm;
33 
34 #define DEBUG_TYPE "aarch64-ldst-opt"
35 
36 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
37 STATISTIC(NumPostFolded, "Number of post-index updates folded");
38 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
39 STATISTIC(NumUnscaledPairCreated,
40           "Number of load/store from unscaled generated");
41 STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
42 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
43 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
44 
45 // The LdStLimit limits how far we search for load/store pairs.
46 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
47                                    cl::init(20), cl::Hidden);
48 
49 // The UpdateLimit limits how far we search for update instructions when we form
50 // pre-/post-index instructions.
51 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
52                                      cl::Hidden);
53 
54 static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
55                                          cl::init(false),
56                                          cl::desc("Enable narrow load merge"));
57 
58 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
59 
60 namespace {
61 
62 typedef struct LdStPairFlags {
63   // If a matching instruction is found, MergeForward is set to true if the
64   // merge is to remove the first instruction and replace the second with
65   // a pair-wise insn, and false if the reverse is true.
66   bool MergeForward;
67 
68   // SExtIdx gives the index of the result of the load pair that must be
69   // extended. The value of SExtIdx assumes that the paired load produces the
70   // value in this order: (I, returned iterator), i.e., -1 means no value has
71   // to be extended, 0 means I, and 1 means the returned iterator.
72   int SExtIdx;
73 
74   LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
75 
76   void setMergeForward(bool V = true) { MergeForward = V; }
77   bool getMergeForward() const { return MergeForward; }
78 
79   void setSExtIdx(int V) { SExtIdx = V; }
80   int getSExtIdx() const { return SExtIdx; }
81 
82 } LdStPairFlags;
83 
84 struct AArch64LoadStoreOpt : public MachineFunctionPass {
85   static char ID;
86   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
87     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
88   }
89 
90   const AArch64InstrInfo *TII;
91   const TargetRegisterInfo *TRI;
92   const AArch64Subtarget *Subtarget;
93 
94   // Track which registers have been modified and used.
95   BitVector ModifiedRegs, UsedRegs;
96 
97   // Scan the instructions looking for a load/store that can be combined
98   // with the current instruction into a load/store pair.
99   // Return the matching instruction if one is found, else MBB->end().
100   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
101                                                LdStPairFlags &Flags,
102                                                unsigned Limit,
103                                                bool FindNarrowMerge);
104 
105   // Scan the instructions looking for a store that writes to the address from
106   // which the current load instruction reads. Return true if one is found.
107   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
108                          MachineBasicBlock::iterator &StoreI);
109 
110   // Merge the two instructions indicated into a wider instruction.
111   MachineBasicBlock::iterator
112   mergeNarrowInsns(MachineBasicBlock::iterator I,
113                    MachineBasicBlock::iterator MergeMI,
114                    const LdStPairFlags &Flags);
115 
116   // Merge the two instructions indicated into a single pair-wise instruction.
117   MachineBasicBlock::iterator
118   mergePairedInsns(MachineBasicBlock::iterator I,
119                    MachineBasicBlock::iterator Paired,
120                    const LdStPairFlags &Flags);
121 
122   // Promote the load that reads directly from the address stored to.
123   MachineBasicBlock::iterator
124   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
125                        MachineBasicBlock::iterator StoreI);
126 
127   // Scan the instruction list to find a base register update that can
128   // be combined with the current instruction (a load or store) using
129   // pre or post indexed addressing with writeback. Scan forwards.
130   MachineBasicBlock::iterator
131   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
132                                 int UnscaledOffset, unsigned Limit);
133 
134   // Scan the instruction list to find a base register update that can
135   // be combined with the current instruction (a load or store) using
136   // pre or post indexed addressing with writeback. Scan backwards.
137   MachineBasicBlock::iterator
138   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
139 
140   // Find an instruction that updates the base register of the ld/st
141   // instruction.
142   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
143                             unsigned BaseReg, int Offset);
144 
145   // Merge a pre- or post-index base register update into a ld/st instruction.
146   MachineBasicBlock::iterator
147   mergeUpdateInsn(MachineBasicBlock::iterator I,
148                   MachineBasicBlock::iterator Update, bool IsPreIdx);
149 
150   // Find and merge foldable ldr/str instructions.
151   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
152 
153   // Find and pair ldr/str instructions.
154   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
155 
156   // Find and promote load instructions which read directly from store.
157   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
158 
159   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
160 
161   bool runOnMachineFunction(MachineFunction &Fn) override;
162 
163   MachineFunctionProperties getRequiredProperties() const override {
164     return MachineFunctionProperties().set(
165         MachineFunctionProperties::Property::AllVRegsAllocated);
166   }
167 
168   const char *getPassName() const override {
169     return AARCH64_LOAD_STORE_OPT_NAME;
170   }
171 };
172 char AArch64LoadStoreOpt::ID = 0;
173 } // namespace
174 
175 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
176                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
177 
178 static unsigned getBitExtrOpcode(MachineInstr &MI) {
179   switch (MI.getOpcode()) {
180   default:
181     llvm_unreachable("Unexpected opcode.");
182   case AArch64::LDRBBui:
183   case AArch64::LDURBBi:
184   case AArch64::LDRHHui:
185   case AArch64::LDURHHi:
186     return AArch64::UBFMWri;
187   case AArch64::LDRSBWui:
188   case AArch64::LDURSBWi:
189   case AArch64::LDRSHWui:
190   case AArch64::LDURSHWi:
191     return AArch64::SBFMWri;
192   }
193 }
194 
195 static bool isNarrowStore(unsigned Opc) {
196   switch (Opc) {
197   default:
198     return false;
199   case AArch64::STRBBui:
200   case AArch64::STURBBi:
201   case AArch64::STRHHui:
202   case AArch64::STURHHi:
203     return true;
204   }
205 }
206 
207 static bool isNarrowLoad(unsigned Opc) {
208   switch (Opc) {
209   default:
210     return false;
211   case AArch64::LDRHHui:
212   case AArch64::LDURHHi:
213   case AArch64::LDRBBui:
214   case AArch64::LDURBBi:
215   case AArch64::LDRSHWui:
216   case AArch64::LDURSHWi:
217   case AArch64::LDRSBWui:
218   case AArch64::LDURSBWi:
219     return true;
220   }
221 }
222 
223 static bool isNarrowLoad(MachineInstr &MI) {
224   return isNarrowLoad(MI.getOpcode());
225 }
226 
227 static bool isNarrowLoadOrStore(unsigned Opc) {
228   return isNarrowLoad(Opc) || isNarrowStore(Opc);
229 }
230 
231 // Scaling factor for unscaled load or store.
232 static int getMemScale(MachineInstr &MI) {
233   switch (MI.getOpcode()) {
234   default:
235     llvm_unreachable("Opcode has unknown scale!");
236   case AArch64::LDRBBui:
237   case AArch64::LDURBBi:
238   case AArch64::LDRSBWui:
239   case AArch64::LDURSBWi:
240   case AArch64::STRBBui:
241   case AArch64::STURBBi:
242     return 1;
243   case AArch64::LDRHHui:
244   case AArch64::LDURHHi:
245   case AArch64::LDRSHWui:
246   case AArch64::LDURSHWi:
247   case AArch64::STRHHui:
248   case AArch64::STURHHi:
249     return 2;
250   case AArch64::LDRSui:
251   case AArch64::LDURSi:
252   case AArch64::LDRSWui:
253   case AArch64::LDURSWi:
254   case AArch64::LDRWui:
255   case AArch64::LDURWi:
256   case AArch64::STRSui:
257   case AArch64::STURSi:
258   case AArch64::STRWui:
259   case AArch64::STURWi:
260   case AArch64::LDPSi:
261   case AArch64::LDPSWi:
262   case AArch64::LDPWi:
263   case AArch64::STPSi:
264   case AArch64::STPWi:
265     return 4;
266   case AArch64::LDRDui:
267   case AArch64::LDURDi:
268   case AArch64::LDRXui:
269   case AArch64::LDURXi:
270   case AArch64::STRDui:
271   case AArch64::STURDi:
272   case AArch64::STRXui:
273   case AArch64::STURXi:
274   case AArch64::LDPDi:
275   case AArch64::LDPXi:
276   case AArch64::STPDi:
277   case AArch64::STPXi:
278     return 8;
279   case AArch64::LDRQui:
280   case AArch64::LDURQi:
281   case AArch64::STRQui:
282   case AArch64::STURQi:
283   case AArch64::LDPQi:
284   case AArch64::STPQi:
285     return 16;
286   }
287 }
288 
289 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
290                                          bool *IsValidLdStrOpc = nullptr) {
291   if (IsValidLdStrOpc)
292     *IsValidLdStrOpc = true;
293   switch (Opc) {
294   default:
295     if (IsValidLdStrOpc)
296       *IsValidLdStrOpc = false;
297     return UINT_MAX;
298   case AArch64::STRDui:
299   case AArch64::STURDi:
300   case AArch64::STRQui:
301   case AArch64::STURQi:
302   case AArch64::STRBBui:
303   case AArch64::STURBBi:
304   case AArch64::STRHHui:
305   case AArch64::STURHHi:
306   case AArch64::STRWui:
307   case AArch64::STURWi:
308   case AArch64::STRXui:
309   case AArch64::STURXi:
310   case AArch64::LDRDui:
311   case AArch64::LDURDi:
312   case AArch64::LDRQui:
313   case AArch64::LDURQi:
314   case AArch64::LDRWui:
315   case AArch64::LDURWi:
316   case AArch64::LDRXui:
317   case AArch64::LDURXi:
318   case AArch64::STRSui:
319   case AArch64::STURSi:
320   case AArch64::LDRSui:
321   case AArch64::LDURSi:
322   case AArch64::LDRHHui:
323   case AArch64::LDURHHi:
324   case AArch64::LDRBBui:
325   case AArch64::LDURBBi:
326     return Opc;
327   case AArch64::LDRSWui:
328     return AArch64::LDRWui;
329   case AArch64::LDURSWi:
330     return AArch64::LDURWi;
331   case AArch64::LDRSBWui:
332     return AArch64::LDRBBui;
333   case AArch64::LDRSHWui:
334     return AArch64::LDRHHui;
335   case AArch64::LDURSBWi:
336     return AArch64::LDURBBi;
337   case AArch64::LDURSHWi:
338     return AArch64::LDURHHi;
339   }
340 }
341 
342 static unsigned getMatchingWideOpcode(unsigned Opc) {
343   switch (Opc) {
344   default:
345     llvm_unreachable("Opcode has no wide equivalent!");
346   case AArch64::STRBBui:
347     return AArch64::STRHHui;
348   case AArch64::STRHHui:
349     return AArch64::STRWui;
350   case AArch64::STURBBi:
351     return AArch64::STURHHi;
352   case AArch64::STURHHi:
353     return AArch64::STURWi;
354   case AArch64::STURWi:
355     return AArch64::STURXi;
356   case AArch64::STRWui:
357     return AArch64::STRXui;
358   case AArch64::LDRHHui:
359   case AArch64::LDRSHWui:
360     return AArch64::LDRWui;
361   case AArch64::LDURHHi:
362   case AArch64::LDURSHWi:
363     return AArch64::LDURWi;
364   case AArch64::LDRBBui:
365   case AArch64::LDRSBWui:
366     return AArch64::LDRHHui;
367   case AArch64::LDURBBi:
368   case AArch64::LDURSBWi:
369     return AArch64::LDURHHi;
370   }
371 }
372 
373 static unsigned getMatchingPairOpcode(unsigned Opc) {
374   switch (Opc) {
375   default:
376     llvm_unreachable("Opcode has no pairwise equivalent!");
377   case AArch64::STRSui:
378   case AArch64::STURSi:
379     return AArch64::STPSi;
380   case AArch64::STRDui:
381   case AArch64::STURDi:
382     return AArch64::STPDi;
383   case AArch64::STRQui:
384   case AArch64::STURQi:
385     return AArch64::STPQi;
386   case AArch64::STRWui:
387   case AArch64::STURWi:
388     return AArch64::STPWi;
389   case AArch64::STRXui:
390   case AArch64::STURXi:
391     return AArch64::STPXi;
392   case AArch64::LDRSui:
393   case AArch64::LDURSi:
394     return AArch64::LDPSi;
395   case AArch64::LDRDui:
396   case AArch64::LDURDi:
397     return AArch64::LDPDi;
398   case AArch64::LDRQui:
399   case AArch64::LDURQi:
400     return AArch64::LDPQi;
401   case AArch64::LDRWui:
402   case AArch64::LDURWi:
403     return AArch64::LDPWi;
404   case AArch64::LDRXui:
405   case AArch64::LDURXi:
406     return AArch64::LDPXi;
407   case AArch64::LDRSWui:
408   case AArch64::LDURSWi:
409     return AArch64::LDPSWi;
410   }
411 }
412 
413 static unsigned isMatchingStore(MachineInstr &LoadInst,
414                                 MachineInstr &StoreInst) {
415   unsigned LdOpc = LoadInst.getOpcode();
416   unsigned StOpc = StoreInst.getOpcode();
417   switch (LdOpc) {
418   default:
419     llvm_unreachable("Unsupported load instruction!");
420   case AArch64::LDRBBui:
421     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
422            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
423   case AArch64::LDURBBi:
424     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
425            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
426   case AArch64::LDRHHui:
427     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
428            StOpc == AArch64::STRXui;
429   case AArch64::LDURHHi:
430     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
431            StOpc == AArch64::STURXi;
432   case AArch64::LDRWui:
433     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
434   case AArch64::LDURWi:
435     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
436   case AArch64::LDRXui:
437     return StOpc == AArch64::STRXui;
438   case AArch64::LDURXi:
439     return StOpc == AArch64::STURXi;
440   }
441 }
442 
443 static unsigned getPreIndexedOpcode(unsigned Opc) {
444   switch (Opc) {
445   default:
446     llvm_unreachable("Opcode has no pre-indexed equivalent!");
447   case AArch64::STRSui:
448     return AArch64::STRSpre;
449   case AArch64::STRDui:
450     return AArch64::STRDpre;
451   case AArch64::STRQui:
452     return AArch64::STRQpre;
453   case AArch64::STRBBui:
454     return AArch64::STRBBpre;
455   case AArch64::STRHHui:
456     return AArch64::STRHHpre;
457   case AArch64::STRWui:
458     return AArch64::STRWpre;
459   case AArch64::STRXui:
460     return AArch64::STRXpre;
461   case AArch64::LDRSui:
462     return AArch64::LDRSpre;
463   case AArch64::LDRDui:
464     return AArch64::LDRDpre;
465   case AArch64::LDRQui:
466     return AArch64::LDRQpre;
467   case AArch64::LDRBBui:
468     return AArch64::LDRBBpre;
469   case AArch64::LDRHHui:
470     return AArch64::LDRHHpre;
471   case AArch64::LDRWui:
472     return AArch64::LDRWpre;
473   case AArch64::LDRXui:
474     return AArch64::LDRXpre;
475   case AArch64::LDRSWui:
476     return AArch64::LDRSWpre;
477   case AArch64::LDPSi:
478     return AArch64::LDPSpre;
479   case AArch64::LDPSWi:
480     return AArch64::LDPSWpre;
481   case AArch64::LDPDi:
482     return AArch64::LDPDpre;
483   case AArch64::LDPQi:
484     return AArch64::LDPQpre;
485   case AArch64::LDPWi:
486     return AArch64::LDPWpre;
487   case AArch64::LDPXi:
488     return AArch64::LDPXpre;
489   case AArch64::STPSi:
490     return AArch64::STPSpre;
491   case AArch64::STPDi:
492     return AArch64::STPDpre;
493   case AArch64::STPQi:
494     return AArch64::STPQpre;
495   case AArch64::STPWi:
496     return AArch64::STPWpre;
497   case AArch64::STPXi:
498     return AArch64::STPXpre;
499   }
500 }
501 
502 static unsigned getPostIndexedOpcode(unsigned Opc) {
503   switch (Opc) {
504   default:
505     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
506   case AArch64::STRSui:
507     return AArch64::STRSpost;
508   case AArch64::STRDui:
509     return AArch64::STRDpost;
510   case AArch64::STRQui:
511     return AArch64::STRQpost;
512   case AArch64::STRBBui:
513     return AArch64::STRBBpost;
514   case AArch64::STRHHui:
515     return AArch64::STRHHpost;
516   case AArch64::STRWui:
517     return AArch64::STRWpost;
518   case AArch64::STRXui:
519     return AArch64::STRXpost;
520   case AArch64::LDRSui:
521     return AArch64::LDRSpost;
522   case AArch64::LDRDui:
523     return AArch64::LDRDpost;
524   case AArch64::LDRQui:
525     return AArch64::LDRQpost;
526   case AArch64::LDRBBui:
527     return AArch64::LDRBBpost;
528   case AArch64::LDRHHui:
529     return AArch64::LDRHHpost;
530   case AArch64::LDRWui:
531     return AArch64::LDRWpost;
532   case AArch64::LDRXui:
533     return AArch64::LDRXpost;
534   case AArch64::LDRSWui:
535     return AArch64::LDRSWpost;
536   case AArch64::LDPSi:
537     return AArch64::LDPSpost;
538   case AArch64::LDPSWi:
539     return AArch64::LDPSWpost;
540   case AArch64::LDPDi:
541     return AArch64::LDPDpost;
542   case AArch64::LDPQi:
543     return AArch64::LDPQpost;
544   case AArch64::LDPWi:
545     return AArch64::LDPWpost;
546   case AArch64::LDPXi:
547     return AArch64::LDPXpost;
548   case AArch64::STPSi:
549     return AArch64::STPSpost;
550   case AArch64::STPDi:
551     return AArch64::STPDpost;
552   case AArch64::STPQi:
553     return AArch64::STPQpost;
554   case AArch64::STPWi:
555     return AArch64::STPWpost;
556   case AArch64::STPXi:
557     return AArch64::STPXpost;
558   }
559 }
560 
561 static bool isPairedLdSt(const MachineInstr &MI) {
562   switch (MI.getOpcode()) {
563   default:
564     return false;
565   case AArch64::LDPSi:
566   case AArch64::LDPSWi:
567   case AArch64::LDPDi:
568   case AArch64::LDPQi:
569   case AArch64::LDPWi:
570   case AArch64::LDPXi:
571   case AArch64::STPSi:
572   case AArch64::STPDi:
573   case AArch64::STPQi:
574   case AArch64::STPWi:
575   case AArch64::STPXi:
576     return true;
577   }
578 }
579 
580 static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
581                                           unsigned PairedRegOp = 0) {
582   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
583   unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
584   return MI.getOperand(Idx);
585 }
586 
587 static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
588   unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
589   return MI.getOperand(Idx);
590 }
591 
592 static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
593   unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
594   return MI.getOperand(Idx);
595 }
596 
597 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
598                                   MachineInstr &StoreInst,
599                                   const AArch64InstrInfo *TII) {
600   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
601   int LoadSize = getMemScale(LoadInst);
602   int StoreSize = getMemScale(StoreInst);
603   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
604                              ? getLdStOffsetOp(StoreInst).getImm()
605                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
606   int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
607                              ? getLdStOffsetOp(LoadInst).getImm()
608                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
609   return (UnscaledStOffset <= UnscaledLdOffset) &&
610          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
611 }
612 
613 static bool isPromotableZeroStoreOpcode(unsigned Opc) {
614   return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi;
615 }
616 
617 static bool isPromotableZeroStoreOpcode(MachineInstr &MI) {
618   return isPromotableZeroStoreOpcode(MI.getOpcode());
619 }
620 
621 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
622   return (isPromotableZeroStoreOpcode(MI)) &&
623          getLdStRegOp(MI).getReg() == AArch64::WZR;
624 }
625 
626 MachineBasicBlock::iterator
627 AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
628                                       MachineBasicBlock::iterator MergeMI,
629                                       const LdStPairFlags &Flags) {
630   MachineBasicBlock::iterator NextI = I;
631   ++NextI;
632   // If NextI is the second of the two instructions to be merged, we need
633   // to skip one further. Either way we merge will invalidate the iterator,
634   // and we don't need to scan the new instruction, as it's a pairwise
635   // instruction, which we're not considering for further action anyway.
636   if (NextI == MergeMI)
637     ++NextI;
638 
639   unsigned Opc = I->getOpcode();
640   bool IsScaled = !TII->isUnscaledLdSt(Opc);
641   int OffsetStride = IsScaled ? 1 : getMemScale(*I);
642 
643   bool MergeForward = Flags.getMergeForward();
644   // Insert our new paired instruction after whichever of the paired
645   // instructions MergeForward indicates.
646   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
647   // Also based on MergeForward is from where we copy the base register operand
648   // so we get the flags compatible with the input code.
649   const MachineOperand &BaseRegOp =
650       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
651 
652   // Which register is Rt and which is Rt2 depends on the offset order.
653   MachineInstr *RtMI, *Rt2MI;
654   if (getLdStOffsetOp(*I).getImm() ==
655       getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) {
656     RtMI = &*MergeMI;
657     Rt2MI = &*I;
658   } else {
659     RtMI = &*I;
660     Rt2MI = &*MergeMI;
661   }
662 
663   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
664   // Change the scaled offset from small to large type.
665   if (IsScaled) {
666     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
667     OffsetImm /= 2;
668   }
669 
670   DebugLoc DL = I->getDebugLoc();
671   MachineBasicBlock *MBB = I->getParent();
672   if (isNarrowLoad(Opc)) {
673     MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI);
674     // When merging small (< 32 bit) loads for big-endian targets, the order of
675     // the component parts gets swapped.
676     if (!Subtarget->isLittleEndian())
677       std::swap(RtMI, Rt2MI);
678     // Construct the new load instruction.
679     MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
680     NewMemMI =
681         BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
682             .addOperand(getLdStRegOp(*RtNewDest))
683             .addOperand(BaseRegOp)
684             .addImm(OffsetImm)
685             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
686     (void)NewMemMI;
687 
688     DEBUG(
689         dbgs()
690         << "Creating the new load and extract. Replacing instructions:\n    ");
691     DEBUG(I->print(dbgs()));
692     DEBUG(dbgs() << "    ");
693     DEBUG(MergeMI->print(dbgs()));
694     DEBUG(dbgs() << "  with instructions:\n    ");
695     DEBUG((NewMemMI)->print(dbgs()));
696 
697     int Width = getMemScale(*I) == 1 ? 8 : 16;
698     int LSBLow = 0;
699     int LSBHigh = Width;
700     int ImmsLow = LSBLow + Width - 1;
701     int ImmsHigh = LSBHigh + Width - 1;
702     MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I);
703     if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
704       // Create the bitfield extract for high bits.
705       BitExtMI1 =
706           BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
707               .addOperand(getLdStRegOp(*Rt2MI))
708               .addReg(getLdStRegOp(*RtNewDest).getReg())
709               .addImm(LSBHigh)
710               .addImm(ImmsHigh);
711       // Create the bitfield extract for low bits.
712       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
713         // For unsigned, prefer to use AND for low bits.
714         BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
715                         .addOperand(getLdStRegOp(*RtMI))
716                         .addReg(getLdStRegOp(*RtNewDest).getReg())
717                         .addImm(ImmsLow);
718       } else {
719         BitExtMI2 =
720             BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
721                 .addOperand(getLdStRegOp(*RtMI))
722                 .addReg(getLdStRegOp(*RtNewDest).getReg())
723                 .addImm(LSBLow)
724                 .addImm(ImmsLow);
725       }
726     } else {
727       // Create the bitfield extract for low bits.
728       if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
729         // For unsigned, prefer to use AND for low bits.
730         BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
731                         .addOperand(getLdStRegOp(*RtMI))
732                         .addReg(getLdStRegOp(*RtNewDest).getReg())
733                         .addImm(ImmsLow);
734       } else {
735         BitExtMI1 =
736             BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
737                 .addOperand(getLdStRegOp(*RtMI))
738                 .addReg(getLdStRegOp(*RtNewDest).getReg())
739                 .addImm(LSBLow)
740                 .addImm(ImmsLow);
741       }
742 
743       // Create the bitfield extract for high bits.
744       BitExtMI2 =
745           BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
746               .addOperand(getLdStRegOp(*Rt2MI))
747               .addReg(getLdStRegOp(*RtNewDest).getReg())
748               .addImm(LSBHigh)
749               .addImm(ImmsHigh);
750     }
751     (void)BitExtMI1;
752     (void)BitExtMI2;
753 
754     DEBUG(dbgs() << "    ");
755     DEBUG((BitExtMI1)->print(dbgs()));
756     DEBUG(dbgs() << "    ");
757     DEBUG((BitExtMI2)->print(dbgs()));
758     DEBUG(dbgs() << "\n");
759 
760     // Erase the old instructions.
761     I->eraseFromParent();
762     MergeMI->eraseFromParent();
763     return NextI;
764   }
765   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
766          "Expected promotable zero store");
767 
768   // Construct the new instruction.
769   MachineInstrBuilder MIB;
770   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
771             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
772             .addOperand(BaseRegOp)
773             .addImm(OffsetImm)
774             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
775   (void)MIB;
776 
777   DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n    ");
778   DEBUG(I->print(dbgs()));
779   DEBUG(dbgs() << "    ");
780   DEBUG(MergeMI->print(dbgs()));
781   DEBUG(dbgs() << "  with instruction:\n    ");
782   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
783   DEBUG(dbgs() << "\n");
784 
785   // Erase the old instructions.
786   I->eraseFromParent();
787   MergeMI->eraseFromParent();
788   return NextI;
789 }
790 
791 MachineBasicBlock::iterator
792 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
793                                       MachineBasicBlock::iterator Paired,
794                                       const LdStPairFlags &Flags) {
795   MachineBasicBlock::iterator NextI = I;
796   ++NextI;
797   // If NextI is the second of the two instructions to be merged, we need
798   // to skip one further. Either way we merge will invalidate the iterator,
799   // and we don't need to scan the new instruction, as it's a pairwise
800   // instruction, which we're not considering for further action anyway.
801   if (NextI == Paired)
802     ++NextI;
803 
804   int SExtIdx = Flags.getSExtIdx();
805   unsigned Opc =
806       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
807   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
808   int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
809 
810   bool MergeForward = Flags.getMergeForward();
811   // Insert our new paired instruction after whichever of the paired
812   // instructions MergeForward indicates.
813   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
814   // Also based on MergeForward is from where we copy the base register operand
815   // so we get the flags compatible with the input code.
816   const MachineOperand &BaseRegOp =
817       MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
818 
819   int Offset = getLdStOffsetOp(*I).getImm();
820   int PairedOffset = getLdStOffsetOp(*Paired).getImm();
821   bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
822   if (IsUnscaled != PairedIsUnscaled) {
823     // We're trying to pair instructions that differ in how they are scaled.  If
824     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
825     // the opposite (i.e., make Paired's offset unscaled).
826     int MemSize = getMemScale(*Paired);
827     if (PairedIsUnscaled) {
828       // If the unscaled offset isn't a multiple of the MemSize, we can't
829       // pair the operations together.
830       assert(!(PairedOffset % getMemScale(*Paired)) &&
831              "Offset should be a multiple of the stride!");
832       PairedOffset /= MemSize;
833     } else {
834       PairedOffset *= MemSize;
835     }
836   }
837 
838   // Which register is Rt and which is Rt2 depends on the offset order.
839   MachineInstr *RtMI, *Rt2MI;
840   if (Offset == PairedOffset + OffsetStride) {
841     RtMI = &*Paired;
842     Rt2MI = &*I;
843     // Here we swapped the assumption made for SExtIdx.
844     // I.e., we turn ldp I, Paired into ldp Paired, I.
845     // Update the index accordingly.
846     if (SExtIdx != -1)
847       SExtIdx = (SExtIdx + 1) % 2;
848   } else {
849     RtMI = &*I;
850     Rt2MI = &*Paired;
851   }
852   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
853   // Scale the immediate offset, if necessary.
854   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
855     assert(!(OffsetImm % getMemScale(*RtMI)) &&
856            "Unscaled offset cannot be scaled.");
857     OffsetImm /= getMemScale(*RtMI);
858   }
859 
860   // Construct the new instruction.
861   MachineInstrBuilder MIB;
862   DebugLoc DL = I->getDebugLoc();
863   MachineBasicBlock *MBB = I->getParent();
864   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
865             .addOperand(getLdStRegOp(*RtMI))
866             .addOperand(getLdStRegOp(*Rt2MI))
867             .addOperand(BaseRegOp)
868             .addImm(OffsetImm)
869             .setMemRefs(I->mergeMemRefsWith(*Paired));
870 
871   (void)MIB;
872 
873   DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
874   DEBUG(I->print(dbgs()));
875   DEBUG(dbgs() << "    ");
876   DEBUG(Paired->print(dbgs()));
877   DEBUG(dbgs() << "  with instruction:\n    ");
878   if (SExtIdx != -1) {
879     // Generate the sign extension for the proper result of the ldp.
880     // I.e., with X1, that would be:
881     // %W1<def> = KILL %W1, %X1<imp-def>
882     // %X1<def> = SBFMXri %X1<kill>, 0, 31
883     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
884     // Right now, DstMO has the extended register, since it comes from an
885     // extended opcode.
886     unsigned DstRegX = DstMO.getReg();
887     // Get the W variant of that register.
888     unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
889     // Update the result of LDP to use the W instead of the X variant.
890     DstMO.setReg(DstRegW);
891     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
892     DEBUG(dbgs() << "\n");
893     // Make the machine verifier happy by providing a definition for
894     // the X register.
895     // Insert this definition right after the generated LDP, i.e., before
896     // InsertionPoint.
897     MachineInstrBuilder MIBKill =
898         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
899             .addReg(DstRegW)
900             .addReg(DstRegX, RegState::Define);
901     MIBKill->getOperand(2).setImplicit();
902     // Create the sign extension.
903     MachineInstrBuilder MIBSXTW =
904         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
905             .addReg(DstRegX)
906             .addImm(0)
907             .addImm(31);
908     (void)MIBSXTW;
909     DEBUG(dbgs() << "  Extend operand:\n    ");
910     DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
911   } else {
912     DEBUG(((MachineInstr *)MIB)->print(dbgs()));
913   }
914   DEBUG(dbgs() << "\n");
915 
916   // Erase the old instructions.
917   I->eraseFromParent();
918   Paired->eraseFromParent();
919 
920   return NextI;
921 }
922 
923 MachineBasicBlock::iterator
924 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
925                                           MachineBasicBlock::iterator StoreI) {
926   MachineBasicBlock::iterator NextI = LoadI;
927   ++NextI;
928 
929   int LoadSize = getMemScale(*LoadI);
930   int StoreSize = getMemScale(*StoreI);
931   unsigned LdRt = getLdStRegOp(*LoadI).getReg();
932   unsigned StRt = getLdStRegOp(*StoreI).getReg();
933   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
934 
935   assert((IsStoreXReg ||
936           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
937          "Unexpected RegClass");
938 
939   MachineInstr *BitExtMI;
940   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
941     // Remove the load, if the destination register of the loads is the same
942     // register for stored value.
943     if (StRt == LdRt && LoadSize == 8) {
944       DEBUG(dbgs() << "Remove load instruction:\n    ");
945       DEBUG(LoadI->print(dbgs()));
946       DEBUG(dbgs() << "\n");
947       LoadI->eraseFromParent();
948       return NextI;
949     }
950     // Replace the load with a mov if the load and store are in the same size.
951     BitExtMI =
952         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
953                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
954             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
955             .addReg(StRt)
956             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
957   } else {
958     // FIXME: Currently we disable this transformation in big-endian targets as
959     // performance and correctness are verified only in little-endian.
960     if (!Subtarget->isLittleEndian())
961       return NextI;
962     bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
963     assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
964            "Unsupported ld/st match");
965     assert(LoadSize <= StoreSize && "Invalid load size");
966     int UnscaledLdOffset = IsUnscaled
967                                ? getLdStOffsetOp(*LoadI).getImm()
968                                : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
969     int UnscaledStOffset = IsUnscaled
970                                ? getLdStOffsetOp(*StoreI).getImm()
971                                : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
972     int Width = LoadSize * 8;
973     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
974     int Imms = Immr + Width - 1;
975     unsigned DestReg = IsStoreXReg
976                            ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
977                                                       &AArch64::GPR64RegClass)
978                            : LdRt;
979 
980     assert((UnscaledLdOffset >= UnscaledStOffset &&
981             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
982            "Invalid offset");
983 
984     Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
985     Imms = Immr + Width - 1;
986     if (UnscaledLdOffset == UnscaledStOffset) {
987       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
988                                 | ((Immr) << 6)               // immr
989                                 | ((Imms) << 0)               // imms
990           ;
991 
992       BitExtMI =
993           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
994                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
995                   DestReg)
996               .addReg(StRt)
997               .addImm(AndMaskEncoded);
998     } else {
999       BitExtMI =
1000           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1001                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1002                   DestReg)
1003               .addReg(StRt)
1004               .addImm(Immr)
1005               .addImm(Imms);
1006     }
1007   }
1008   (void)BitExtMI;
1009 
1010   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
1011   DEBUG(StoreI->print(dbgs()));
1012   DEBUG(dbgs() << "    ");
1013   DEBUG(LoadI->print(dbgs()));
1014   DEBUG(dbgs() << "  with instructions:\n    ");
1015   DEBUG(StoreI->print(dbgs()));
1016   DEBUG(dbgs() << "    ");
1017   DEBUG((BitExtMI)->print(dbgs()));
1018   DEBUG(dbgs() << "\n");
1019 
1020   // Erase the old instructions.
1021   LoadI->eraseFromParent();
1022   return NextI;
1023 }
1024 
1025 /// trackRegDefsUses - Remember what registers the specified instruction uses
1026 /// and modifies.
1027 static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
1028                              BitVector &UsedRegs,
1029                              const TargetRegisterInfo *TRI) {
1030   for (const MachineOperand &MO : MI.operands()) {
1031     if (MO.isRegMask())
1032       ModifiedRegs.setBitsNotInMask(MO.getRegMask());
1033 
1034     if (!MO.isReg())
1035       continue;
1036     unsigned Reg = MO.getReg();
1037     if (!Reg)
1038       continue;
1039     if (MO.isDef()) {
1040       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
1041         ModifiedRegs.set(*AI);
1042     } else {
1043       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
1044       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
1045         UsedRegs.set(*AI);
1046     }
1047   }
1048 }
1049 
1050 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1051   // Convert the byte-offset used by unscaled into an "element" offset used
1052   // by the scaled pair load/store instructions.
1053   if (IsUnscaled) {
1054     // If the byte-offset isn't a multiple of the stride, there's no point
1055     // trying to match it.
1056     if (Offset % OffsetStride)
1057       return false;
1058     Offset /= OffsetStride;
1059   }
1060   return Offset <= 63 && Offset >= -64;
1061 }
1062 
1063 // Do alignment, specialized to power of 2 and for signed ints,
1064 // avoiding having to do a C-style cast from uint_64t to int when
1065 // using alignTo from include/llvm/Support/MathExtras.h.
1066 // FIXME: Move this function to include/MathExtras.h?
1067 static int alignTo(int Num, int PowOf2) {
1068   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1069 }
1070 
1071 static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
1072                      const AArch64InstrInfo *TII) {
1073   // One of the instructions must modify memory.
1074   if (!MIa.mayStore() && !MIb.mayStore())
1075     return false;
1076 
1077   // Both instructions must be memory operations.
1078   if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
1079     return false;
1080 
1081   return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
1082 }
1083 
1084 static bool mayAlias(MachineInstr &MIa,
1085                      SmallVectorImpl<MachineInstr *> &MemInsns,
1086                      const AArch64InstrInfo *TII) {
1087   for (MachineInstr *MIb : MemInsns)
1088     if (mayAlias(MIa, *MIb, TII))
1089       return true;
1090 
1091   return false;
1092 }
1093 
1094 bool AArch64LoadStoreOpt::findMatchingStore(
1095     MachineBasicBlock::iterator I, unsigned Limit,
1096     MachineBasicBlock::iterator &StoreI) {
1097   MachineBasicBlock::iterator B = I->getParent()->begin();
1098   MachineBasicBlock::iterator MBBI = I;
1099   MachineInstr &LoadMI = *I;
1100   unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
1101 
1102   // If the load is the first instruction in the block, there's obviously
1103   // not any matching store.
1104   if (MBBI == B)
1105     return false;
1106 
1107   // Track which registers have been modified and used between the first insn
1108   // and the second insn.
1109   ModifiedRegs.reset();
1110   UsedRegs.reset();
1111 
1112   unsigned Count = 0;
1113   do {
1114     --MBBI;
1115     MachineInstr &MI = *MBBI;
1116 
1117     // Don't count transient instructions towards the search limit since there
1118     // may be different numbers of them if e.g. debug information is present.
1119     if (!MI.isTransient())
1120       ++Count;
1121 
1122     // If the load instruction reads directly from the address to which the
1123     // store instruction writes and the stored value is not modified, we can
1124     // promote the load. Since we do not handle stores with pre-/post-index,
1125     // it's unnecessary to check if BaseReg is modified by the store itself.
1126     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1127         BaseReg == getLdStBaseOp(MI).getReg() &&
1128         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1129         !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
1130       StoreI = MBBI;
1131       return true;
1132     }
1133 
1134     if (MI.isCall())
1135       return false;
1136 
1137     // Update modified / uses register lists.
1138     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1139 
1140     // Otherwise, if the base register is modified, we have no match, so
1141     // return early.
1142     if (ModifiedRegs[BaseReg])
1143       return false;
1144 
1145     // If we encounter a store aliased with the load, return early.
1146     if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
1147       return false;
1148   } while (MBBI != B && Count < Limit);
1149   return false;
1150 }
1151 
1152 // Returns true if FirstMI and MI are candidates for merging or pairing.
1153 // Otherwise, returns false.
1154 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1155                                        LdStPairFlags &Flags,
1156                                        const AArch64InstrInfo *TII) {
1157   // If this is volatile or if pairing is suppressed, not a candidate.
1158   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1159     return false;
1160 
1161   // We should have already checked FirstMI for pair suppression and volatility.
1162   assert(!FirstMI.hasOrderedMemoryRef() &&
1163          !TII->isLdStPairSuppressed(FirstMI) &&
1164          "FirstMI shouldn't get here if either of these checks are true.");
1165 
1166   unsigned OpcA = FirstMI.getOpcode();
1167   unsigned OpcB = MI.getOpcode();
1168 
1169   // Opcodes match: nothing more to check.
1170   if (OpcA == OpcB)
1171     return true;
1172 
1173   // Try to match a sign-extended load/store with a zero-extended load/store.
1174   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1175   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1176   assert(IsValidLdStrOpc &&
1177          "Given Opc should be a Load or Store with an immediate");
1178   // OpcA will be the first instruction in the pair.
1179   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1180     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1181     return true;
1182   }
1183 
1184   // If the second instruction isn't even a load/store, bail out.
1185   if (!PairIsValidLdStrOpc)
1186     return false;
1187 
1188   // FIXME: We don't support merging narrow loads/stores with mixed
1189   // scaled/unscaled offsets.
1190   if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB))
1191     return false;
1192 
1193   // Try to match an unscaled load/store with a scaled load/store.
1194   return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
1195          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1196 
1197   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1198 }
1199 
1200 /// Scan the instructions looking for a load/store that can be combined with the
1201 /// current instruction into a wider equivalent or a load/store pair.
1202 MachineBasicBlock::iterator
1203 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1204                                       LdStPairFlags &Flags, unsigned Limit,
1205                                       bool FindNarrowMerge) {
1206   MachineBasicBlock::iterator E = I->getParent()->end();
1207   MachineBasicBlock::iterator MBBI = I;
1208   MachineInstr &FirstMI = *I;
1209   ++MBBI;
1210 
1211   bool MayLoad = FirstMI.mayLoad();
1212   bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
1213   unsigned Reg = getLdStRegOp(FirstMI).getReg();
1214   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
1215   int Offset = getLdStOffsetOp(FirstMI).getImm();
1216   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
1217   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1218 
1219   // Track which registers have been modified and used between the first insn
1220   // (inclusive) and the second insn.
1221   ModifiedRegs.reset();
1222   UsedRegs.reset();
1223 
1224   // Remember any instructions that read/write memory between FirstMI and MI.
1225   SmallVector<MachineInstr *, 4> MemInsns;
1226 
1227   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1228     MachineInstr &MI = *MBBI;
1229 
1230     // Don't count transient instructions towards the search limit since there
1231     // may be different numbers of them if e.g. debug information is present.
1232     if (!MI.isTransient())
1233       ++Count;
1234 
1235     Flags.setSExtIdx(-1);
1236     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1237         getLdStOffsetOp(MI).isImm()) {
1238       assert(MI.mayLoadOrStore() && "Expected memory operation.");
1239       // If we've found another instruction with the same opcode, check to see
1240       // if the base and offset are compatible with our starting instruction.
1241       // These instructions all have scaled immediate operands, so we just
1242       // check for +1/-1. Make sure to check the new instruction offset is
1243       // actually an immediate and not a symbolic reference destined for
1244       // a relocation.
1245       unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
1246       int MIOffset = getLdStOffsetOp(MI).getImm();
1247       bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
1248       if (IsUnscaled != MIIsUnscaled) {
1249         // We're trying to pair instructions that differ in how they are scaled.
1250         // If FirstMI is scaled then scale the offset of MI accordingly.
1251         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1252         int MemSize = getMemScale(MI);
1253         if (MIIsUnscaled) {
1254           // If the unscaled offset isn't a multiple of the MemSize, we can't
1255           // pair the operations together: bail and keep looking.
1256           if (MIOffset % MemSize)
1257             continue;
1258           MIOffset /= MemSize;
1259         } else {
1260           MIOffset *= MemSize;
1261         }
1262       }
1263 
1264       if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
1265                                    (Offset + OffsetStride == MIOffset))) {
1266         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1267         if (FindNarrowMerge) {
1268           // If the alignment requirements of the scaled wide load/store
1269           // instruction can't express the offset of the scaled narrow input,
1270           // bail and keep looking. For promotable zero stores, allow only when
1271           // the stored value is the same (i.e., WZR).
1272           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1273               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1274             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1275             MemInsns.push_back(&MI);
1276             continue;
1277           }
1278         } else {
1279           // Pairwise instructions have a 7-bit signed offset field. Single
1280           // insns have a 12-bit unsigned offset field.  If the resultant
1281           // immediate offset of merging these instructions is out of range for
1282           // a pairwise instruction, bail and keep looking.
1283           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1284             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1285             MemInsns.push_back(&MI);
1286             continue;
1287           }
1288           // If the alignment requirements of the paired (scaled) instruction
1289           // can't express the offset of the unscaled input, bail and keep
1290           // looking.
1291           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1292             trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1293             MemInsns.push_back(&MI);
1294             continue;
1295           }
1296         }
1297         // If the destination register of the loads is the same register, bail
1298         // and keep looking. A load-pair instruction with both destination
1299         // registers the same is UNPREDICTABLE and will result in an exception.
1300         if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
1301           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1302           MemInsns.push_back(&MI);
1303           continue;
1304         }
1305 
1306         // If the Rt of the second instruction was not modified or used between
1307         // the two instructions and none of the instructions between the second
1308         // and first alias with the second, we can combine the second into the
1309         // first.
1310         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
1311             !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
1312             !mayAlias(MI, MemInsns, TII)) {
1313           Flags.setMergeForward(false);
1314           return MBBI;
1315         }
1316 
1317         // Likewise, if the Rt of the first instruction is not modified or used
1318         // between the two instructions and none of the instructions between the
1319         // first and the second alias with the first, we can combine the first
1320         // into the second.
1321         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
1322             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
1323             !mayAlias(FirstMI, MemInsns, TII)) {
1324           Flags.setMergeForward(true);
1325           return MBBI;
1326         }
1327         // Unable to combine these instructions due to interference in between.
1328         // Keep looking.
1329       }
1330     }
1331 
1332     // If the instruction wasn't a matching load or store.  Stop searching if we
1333     // encounter a call instruction that might modify memory.
1334     if (MI.isCall())
1335       return E;
1336 
1337     // Update modified / uses register lists.
1338     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1339 
1340     // Otherwise, if the base register is modified, we have no match, so
1341     // return early.
1342     if (ModifiedRegs[BaseReg])
1343       return E;
1344 
1345     // Update list of instructions that read/write memory.
1346     if (MI.mayLoadOrStore())
1347       MemInsns.push_back(&MI);
1348   }
1349   return E;
1350 }
1351 
1352 MachineBasicBlock::iterator
1353 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1354                                      MachineBasicBlock::iterator Update,
1355                                      bool IsPreIdx) {
1356   assert((Update->getOpcode() == AArch64::ADDXri ||
1357           Update->getOpcode() == AArch64::SUBXri) &&
1358          "Unexpected base register update instruction to merge!");
1359   MachineBasicBlock::iterator NextI = I;
1360   // Return the instruction following the merged instruction, which is
1361   // the instruction following our unmerged load. Unless that's the add/sub
1362   // instruction we're merging, in which case it's the one after that.
1363   if (++NextI == Update)
1364     ++NextI;
1365 
1366   int Value = Update->getOperand(2).getImm();
1367   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1368          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1369   if (Update->getOpcode() == AArch64::SUBXri)
1370     Value = -Value;
1371 
1372   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1373                              : getPostIndexedOpcode(I->getOpcode());
1374   MachineInstrBuilder MIB;
1375   if (!isPairedLdSt(*I)) {
1376     // Non-paired instruction.
1377     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1378               .addOperand(getLdStRegOp(*Update))
1379               .addOperand(getLdStRegOp(*I))
1380               .addOperand(getLdStBaseOp(*I))
1381               .addImm(Value)
1382               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1383   } else {
1384     // Paired instruction.
1385     int Scale = getMemScale(*I);
1386     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1387               .addOperand(getLdStRegOp(*Update))
1388               .addOperand(getLdStRegOp(*I, 0))
1389               .addOperand(getLdStRegOp(*I, 1))
1390               .addOperand(getLdStBaseOp(*I))
1391               .addImm(Value / Scale)
1392               .setMemRefs(I->memoperands_begin(), I->memoperands_end());
1393   }
1394   (void)MIB;
1395 
1396   if (IsPreIdx)
1397     DEBUG(dbgs() << "Creating pre-indexed load/store.");
1398   else
1399     DEBUG(dbgs() << "Creating post-indexed load/store.");
1400   DEBUG(dbgs() << "    Replacing instructions:\n    ");
1401   DEBUG(I->print(dbgs()));
1402   DEBUG(dbgs() << "    ");
1403   DEBUG(Update->print(dbgs()));
1404   DEBUG(dbgs() << "  with instruction:\n    ");
1405   DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1406   DEBUG(dbgs() << "\n");
1407 
1408   // Erase the old instructions for the block.
1409   I->eraseFromParent();
1410   Update->eraseFromParent();
1411 
1412   return NextI;
1413 }
1414 
1415 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1416                                                MachineInstr &MI,
1417                                                unsigned BaseReg, int Offset) {
1418   switch (MI.getOpcode()) {
1419   default:
1420     break;
1421   case AArch64::SUBXri:
1422     // Negate the offset for a SUB instruction.
1423     Offset *= -1;
1424   // FALLTHROUGH
1425   case AArch64::ADDXri:
1426     // Make sure it's a vanilla immediate operand, not a relocation or
1427     // anything else we can't handle.
1428     if (!MI.getOperand(2).isImm())
1429       break;
1430     // Watch out for 1 << 12 shifted value.
1431     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1432       break;
1433 
1434     // The update instruction source and destination register must be the
1435     // same as the load/store base register.
1436     if (MI.getOperand(0).getReg() != BaseReg ||
1437         MI.getOperand(1).getReg() != BaseReg)
1438       break;
1439 
1440     bool IsPairedInsn = isPairedLdSt(MemMI);
1441     int UpdateOffset = MI.getOperand(2).getImm();
1442     // For non-paired load/store instructions, the immediate must fit in a
1443     // signed 9-bit integer.
1444     if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1445       break;
1446 
1447     // For paired load/store instructions, the immediate must be a multiple of
1448     // the scaling factor.  The scaled offset must also fit into a signed 7-bit
1449     // integer.
1450     if (IsPairedInsn) {
1451       int Scale = getMemScale(MemMI);
1452       if (UpdateOffset % Scale != 0)
1453         break;
1454 
1455       int ScaledOffset = UpdateOffset / Scale;
1456       if (ScaledOffset > 64 || ScaledOffset < -64)
1457         break;
1458     }
1459 
1460     // If we have a non-zero Offset, we check that it matches the amount
1461     // we're adding to the register.
1462     if (!Offset || Offset == MI.getOperand(2).getImm())
1463       return true;
1464     break;
1465   }
1466   return false;
1467 }
1468 
1469 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1470     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1471   MachineBasicBlock::iterator E = I->getParent()->end();
1472   MachineInstr &MemMI = *I;
1473   MachineBasicBlock::iterator MBBI = I;
1474 
1475   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1476   int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1477 
1478   // Scan forward looking for post-index opportunities.  Updating instructions
1479   // can't be formed if the memory instruction doesn't have the offset we're
1480   // looking for.
1481   if (MIUnscaledOffset != UnscaledOffset)
1482     return E;
1483 
1484   // If the base register overlaps a destination register, we can't
1485   // merge the update.
1486   bool IsPairedInsn = isPairedLdSt(MemMI);
1487   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1488     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1489     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1490       return E;
1491   }
1492 
1493   // Track which registers have been modified and used between the first insn
1494   // (inclusive) and the second insn.
1495   ModifiedRegs.reset();
1496   UsedRegs.reset();
1497   ++MBBI;
1498   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
1499     MachineInstr &MI = *MBBI;
1500 
1501     // Don't count transient instructions towards the search limit since there
1502     // may be different numbers of them if e.g. debug information is present.
1503     if (!MI.isTransient())
1504       ++Count;
1505 
1506     // If we found a match, return it.
1507     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1508       return MBBI;
1509 
1510     // Update the status of what the instruction clobbered and used.
1511     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1512 
1513     // Otherwise, if the base register is used or modified, we have no match, so
1514     // return early.
1515     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1516       return E;
1517   }
1518   return E;
1519 }
1520 
1521 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1522     MachineBasicBlock::iterator I, unsigned Limit) {
1523   MachineBasicBlock::iterator B = I->getParent()->begin();
1524   MachineBasicBlock::iterator E = I->getParent()->end();
1525   MachineInstr &MemMI = *I;
1526   MachineBasicBlock::iterator MBBI = I;
1527 
1528   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1529   int Offset = getLdStOffsetOp(MemMI).getImm();
1530 
1531   // If the load/store is the first instruction in the block, there's obviously
1532   // not any matching update. Ditto if the memory offset isn't zero.
1533   if (MBBI == B || Offset != 0)
1534     return E;
1535   // If the base register overlaps a destination register, we can't
1536   // merge the update.
1537   bool IsPairedInsn = isPairedLdSt(MemMI);
1538   for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1539     unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1540     if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1541       return E;
1542   }
1543 
1544   // Track which registers have been modified and used between the first insn
1545   // (inclusive) and the second insn.
1546   ModifiedRegs.reset();
1547   UsedRegs.reset();
1548   unsigned Count = 0;
1549   do {
1550     --MBBI;
1551     MachineInstr &MI = *MBBI;
1552 
1553     // Don't count transient instructions towards the search limit since there
1554     // may be different numbers of them if e.g. debug information is present.
1555     if (!MI.isTransient())
1556       ++Count;
1557 
1558     // If we found a match, return it.
1559     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
1560       return MBBI;
1561 
1562     // Update the status of what the instruction clobbered and used.
1563     trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1564 
1565     // Otherwise, if the base register is used or modified, we have no match, so
1566     // return early.
1567     if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1568       return E;
1569   } while (MBBI != B && Count < Limit);
1570   return E;
1571 }
1572 
1573 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
1574     MachineBasicBlock::iterator &MBBI) {
1575   MachineInstr &MI = *MBBI;
1576   // If this is a volatile load, don't mess with it.
1577   if (MI.hasOrderedMemoryRef())
1578     return false;
1579 
1580   // Make sure this is a reg+imm.
1581   // FIXME: It is possible to extend it to handle reg+reg cases.
1582   if (!getLdStOffsetOp(MI).isImm())
1583     return false;
1584 
1585   // Look backward up to LdStLimit instructions.
1586   MachineBasicBlock::iterator StoreI;
1587   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
1588     ++NumLoadsFromStoresPromoted;
1589     // Promote the load. Keeping the iterator straight is a
1590     // pain, so we let the merge routine tell us what the next instruction
1591     // is after it's done mucking about.
1592     MBBI = promoteLoadFromStore(MBBI, StoreI);
1593     return true;
1594   }
1595   return false;
1596 }
1597 
1598 // Find narrow loads that can be converted into a single wider load with
1599 // bitfield extract instructions.  Also merge adjacent zero stores into a wider
1600 // store.
1601 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
1602     MachineBasicBlock::iterator &MBBI) {
1603   assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) &&
1604          "Expected narrow op.");
1605   MachineInstr &MI = *MBBI;
1606   MachineBasicBlock::iterator E = MI.getParent()->end();
1607 
1608   if (!TII->isCandidateToMergeOrPair(MI))
1609     return false;
1610 
1611   // For promotable zero stores, the stored value should be WZR.
1612   if (isPromotableZeroStoreOpcode(MI) &&
1613       getLdStRegOp(MI).getReg() != AArch64::WZR)
1614     return false;
1615 
1616   // Look ahead up to LdStLimit instructions for a mergable instruction.
1617   LdStPairFlags Flags;
1618   MachineBasicBlock::iterator MergeMI =
1619       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
1620   if (MergeMI != E) {
1621     if (isNarrowLoad(MI)) {
1622       ++NumNarrowLoadsPromoted;
1623     } else if (isPromotableZeroStoreInst(MI)) {
1624       ++NumZeroStoresPromoted;
1625     }
1626     // Keeping the iterator straight is a pain, so we let the merge routine tell
1627     // us what the next instruction is after it's done mucking about.
1628     MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags);
1629     return true;
1630   }
1631   return false;
1632 }
1633 
1634 // Find loads and stores that can be merged into a single load or store pair
1635 // instruction.
1636 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
1637   MachineInstr &MI = *MBBI;
1638   MachineBasicBlock::iterator E = MI.getParent()->end();
1639 
1640   if (!TII->isCandidateToMergeOrPair(MI))
1641     return false;
1642 
1643   // Early exit if the offset is not possible to match. (6 bits of positive
1644   // range, plus allow an extra one in case we find a later insn that matches
1645   // with Offset-1)
1646   bool IsUnscaled = TII->isUnscaledLdSt(MI);
1647   int Offset = getLdStOffsetOp(MI).getImm();
1648   int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
1649   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
1650     return false;
1651 
1652   // Look ahead up to LdStLimit instructions for a pairable instruction.
1653   LdStPairFlags Flags;
1654   MachineBasicBlock::iterator Paired =
1655       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
1656   if (Paired != E) {
1657     ++NumPairCreated;
1658     if (TII->isUnscaledLdSt(MI))
1659       ++NumUnscaledPairCreated;
1660     // Keeping the iterator straight is a pain, so we let the merge routine tell
1661     // us what the next instruction is after it's done mucking about.
1662     MBBI = mergePairedInsns(MBBI, Paired, Flags);
1663     return true;
1664   }
1665   return false;
1666 }
1667 
1668 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
1669                                         bool enableNarrowLdOpt) {
1670   bool Modified = false;
1671   // Four tranformations to do here:
1672   // 1) Find loads that directly read from stores and promote them by
1673   //    replacing with mov instructions. If the store is wider than the load,
1674   //    the load will be replaced with a bitfield extract.
1675   //      e.g.,
1676   //        str w1, [x0, #4]
1677   //        ldrh w2, [x0, #6]
1678   //        ; becomes
1679   //        str w1, [x0, #4]
1680   //        lsr w2, w1, #16
1681   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1682        MBBI != E;) {
1683     MachineInstr &MI = *MBBI;
1684     switch (MI.getOpcode()) {
1685     default:
1686       // Just move on to the next instruction.
1687       ++MBBI;
1688       break;
1689     // Scaled instructions.
1690     case AArch64::LDRBBui:
1691     case AArch64::LDRHHui:
1692     case AArch64::LDRWui:
1693     case AArch64::LDRXui:
1694     // Unscaled instructions.
1695     case AArch64::LDURBBi:
1696     case AArch64::LDURHHi:
1697     case AArch64::LDURWi:
1698     case AArch64::LDURXi: {
1699       if (tryToPromoteLoadFromStore(MBBI)) {
1700         Modified = true;
1701         break;
1702       }
1703       ++MBBI;
1704       break;
1705     }
1706     }
1707   }
1708   // 2) Find narrow loads that can be converted into a single wider load
1709   //    with bitfield extract instructions.
1710   //      e.g.,
1711   //        ldrh w0, [x2]
1712   //        ldrh w1, [x2, #2]
1713   //        ; becomes
1714   //        ldr w0, [x2]
1715   //        ubfx w1, w0, #16, #16
1716   //        and w0, w0, #ffff
1717   //
1718   //    Also merge adjacent zero stores into a wider store.
1719   //      e.g.,
1720   //        strh wzr, [x0]
1721   //        strh wzr, [x0, #2]
1722   //        ; becomes
1723   //        str wzr, [x0]
1724   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1725        enableNarrowLdOpt && MBBI != E;) {
1726     MachineInstr &MI = *MBBI;
1727     unsigned Opc = MI.getOpcode();
1728     if (isPromotableZeroStoreOpcode(Opc) ||
1729         (EnableNarrowLdMerge && isNarrowLoad(Opc))) {
1730       if (tryToMergeLdStInst(MBBI)) {
1731         Modified = true;
1732       } else
1733         ++MBBI;
1734     } else
1735       ++MBBI;
1736   }
1737 
1738   // 3) Find loads and stores that can be merged into a single load or store
1739   //    pair instruction.
1740   //      e.g.,
1741   //        ldr x0, [x2]
1742   //        ldr x1, [x2, #8]
1743   //        ; becomes
1744   //        ldp x0, x1, [x2]
1745   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1746        MBBI != E;) {
1747     MachineInstr &MI = *MBBI;
1748     switch (MI.getOpcode()) {
1749     default:
1750       // Just move on to the next instruction.
1751       ++MBBI;
1752       break;
1753     // Scaled instructions.
1754     case AArch64::STRSui:
1755     case AArch64::STRDui:
1756     case AArch64::STRQui:
1757     case AArch64::STRXui:
1758     case AArch64::STRWui:
1759     case AArch64::LDRSui:
1760     case AArch64::LDRDui:
1761     case AArch64::LDRQui:
1762     case AArch64::LDRXui:
1763     case AArch64::LDRWui:
1764     case AArch64::LDRSWui:
1765     // Unscaled instructions.
1766     case AArch64::STURSi:
1767     case AArch64::STURDi:
1768     case AArch64::STURQi:
1769     case AArch64::STURWi:
1770     case AArch64::STURXi:
1771     case AArch64::LDURSi:
1772     case AArch64::LDURDi:
1773     case AArch64::LDURQi:
1774     case AArch64::LDURWi:
1775     case AArch64::LDURXi:
1776     case AArch64::LDURSWi: {
1777       if (tryToPairLdStInst(MBBI)) {
1778         Modified = true;
1779         break;
1780       }
1781       ++MBBI;
1782       break;
1783     }
1784     }
1785   }
1786   // 4) Find base register updates that can be merged into the load or store
1787   //    as a base-reg writeback.
1788   //      e.g.,
1789   //        ldr x0, [x2]
1790   //        add x2, x2, #4
1791   //        ; becomes
1792   //        ldr x0, [x2], #4
1793   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1794        MBBI != E;) {
1795     MachineInstr &MI = *MBBI;
1796     // Do update merging. It's simpler to keep this separate from the above
1797     // switchs, though not strictly necessary.
1798     unsigned Opc = MI.getOpcode();
1799     switch (Opc) {
1800     default:
1801       // Just move on to the next instruction.
1802       ++MBBI;
1803       break;
1804     // Scaled instructions.
1805     case AArch64::STRSui:
1806     case AArch64::STRDui:
1807     case AArch64::STRQui:
1808     case AArch64::STRXui:
1809     case AArch64::STRWui:
1810     case AArch64::STRHHui:
1811     case AArch64::STRBBui:
1812     case AArch64::LDRSui:
1813     case AArch64::LDRDui:
1814     case AArch64::LDRQui:
1815     case AArch64::LDRXui:
1816     case AArch64::LDRWui:
1817     case AArch64::LDRHHui:
1818     case AArch64::LDRBBui:
1819     // Unscaled instructions.
1820     case AArch64::STURSi:
1821     case AArch64::STURDi:
1822     case AArch64::STURQi:
1823     case AArch64::STURWi:
1824     case AArch64::STURXi:
1825     case AArch64::LDURSi:
1826     case AArch64::LDURDi:
1827     case AArch64::LDURQi:
1828     case AArch64::LDURWi:
1829     case AArch64::LDURXi:
1830     // Paired instructions.
1831     case AArch64::LDPSi:
1832     case AArch64::LDPSWi:
1833     case AArch64::LDPDi:
1834     case AArch64::LDPQi:
1835     case AArch64::LDPWi:
1836     case AArch64::LDPXi:
1837     case AArch64::STPSi:
1838     case AArch64::STPDi:
1839     case AArch64::STPQi:
1840     case AArch64::STPWi:
1841     case AArch64::STPXi: {
1842       // Make sure this is a reg+imm (as opposed to an address reloc).
1843       if (!getLdStOffsetOp(MI).isImm()) {
1844         ++MBBI;
1845         break;
1846       }
1847       // Look forward to try to form a post-index instruction. For example,
1848       // ldr x0, [x20]
1849       // add x20, x20, #32
1850       //   merged into:
1851       // ldr x0, [x20], #32
1852       MachineBasicBlock::iterator Update =
1853           findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
1854       if (Update != E) {
1855         // Merge the update into the ld/st.
1856         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1857         Modified = true;
1858         ++NumPostFolded;
1859         break;
1860       }
1861       // Don't know how to handle pre/post-index versions, so move to the next
1862       // instruction.
1863       if (TII->isUnscaledLdSt(Opc)) {
1864         ++MBBI;
1865         break;
1866       }
1867 
1868       // Look back to try to find a pre-index instruction. For example,
1869       // add x0, x0, #8
1870       // ldr x1, [x0]
1871       //   merged into:
1872       // ldr x1, [x0, #8]!
1873       Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
1874       if (Update != E) {
1875         // Merge the update into the ld/st.
1876         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1877         Modified = true;
1878         ++NumPreFolded;
1879         break;
1880       }
1881       // The immediate in the load/store is scaled by the size of the memory
1882       // operation. The immediate in the add we're looking for,
1883       // however, is not, so adjust here.
1884       int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1885 
1886       // Look forward to try to find a post-index instruction. For example,
1887       // ldr x1, [x0, #64]
1888       // add x0, x0, #64
1889       //   merged into:
1890       // ldr x1, [x0, #64]!
1891       Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
1892       if (Update != E) {
1893         // Merge the update into the ld/st.
1894         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1895         Modified = true;
1896         ++NumPreFolded;
1897         break;
1898       }
1899 
1900       // Nothing found. Just move to the next instruction.
1901       ++MBBI;
1902       break;
1903     }
1904     }
1905   }
1906 
1907   return Modified;
1908 }
1909 
1910 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1911   if (skipFunction(*Fn.getFunction()))
1912     return false;
1913 
1914   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1915   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1916   TRI = Subtarget->getRegisterInfo();
1917 
1918   // Resize the modified and used register bitfield trackers.  We do this once
1919   // per function and then clear the bitfield each time we optimize a load or
1920   // store.
1921   ModifiedRegs.resize(TRI->getNumRegs());
1922   UsedRegs.resize(TRI->getNumRegs());
1923 
1924   bool Modified = false;
1925   bool enableNarrowLdOpt =
1926     Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
1927   for (auto &MBB : Fn)
1928     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
1929 
1930   return Modified;
1931 }
1932 
1933 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
1934 // loads and stores near one another?
1935 
1936 // FIXME: When pairing store instructions it's very possible for this pass to
1937 // hoist a store with a KILL marker above another use (without a KILL marker).
1938 // The resulting IR is invalid, but nothing uses the KILL markers after this
1939 // pass, so it's never caused a problem in practice.
1940 
1941 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1942 /// load / store optimization pass.
1943 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1944   return new AArch64LoadStoreOpt();
1945 }
1946