1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
11 //
12 // The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag, or, alternatively, it should apply an
// ad-hoc fix to the unwind information.
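//
// Examples of the transformations performed (illustrative only; registers and
// offsets are arbitrary):
//   Pairing:          ldr x0, [x2]; ldr x1, [x2, #8]    =>  ldp x0, x1, [x2]
//   Update folding:   add x2, x2, #16; ldr x0, [x2]     =>  ldr x0, [x2, #16]!
//   Zero-store merge: str wzr, [x3]; str wzr, [x3, #4]  =>  str xzr, [x3]
//   Load-from-store:  str x0, [x4]; ldr x1, [x4]        =>  str x0, [x4];
//                                                           mov x1, x0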
17 //
18 //===----------------------------------------------------------------------===//
19 
20 #include "AArch64InstrInfo.h"
21 #include "AArch64MachineFunctionInfo.h"
22 #include "AArch64Subtarget.h"
23 #include "MCTargetDesc/AArch64AddressingModes.h"
24 #include "llvm/ADT/BitVector.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/ADT/iterator_range.h"
29 #include "llvm/Analysis/AliasAnalysis.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineFunctionPass.h"
33 #include "llvm/CodeGen/MachineInstr.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/IR/DebugLoc.h"
39 #include "llvm/MC/MCAsmInfo.h"
40 #include "llvm/MC/MCDwarf.h"
41 #include "llvm/MC/MCRegisterInfo.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/DebugCounter.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <functional>
51 #include <iterator>
52 #include <limits>
53 
54 using namespace llvm;
55 
56 #define DEBUG_TYPE "aarch64-ldst-opt"
57 
58 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
59 STATISTIC(NumPostFolded, "Number of post-index updates folded");
60 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
61 STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled load/stores");
63 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
64 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65 
66 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
67               "Controls which pairs are considered for renaming");
68 
69 // The LdStLimit limits how far we search for load/store pairs.
70 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
71                                    cl::init(20), cl::Hidden);
72 
73 // The UpdateLimit limits how far we search for update instructions when we form
74 // pre-/post-index instructions.
75 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
76                                      cl::Hidden);
77 
78 // Enable register renaming to find additional store pairing opportunities.
79 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
80                                     cl::init(true), cl::Hidden);
81 
82 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
83 
84 namespace {
85 
86 using LdStPairFlags = struct LdStPairFlags {
87   // If a matching instruction is found, MergeForward is set to true if the
88   // merge is to remove the first instruction and replace the second with
89   // a pair-wise insn, and false if the reverse is true.
90   bool MergeForward = false;
91 
92   // SExtIdx gives the index of the result of the load pair that must be
93   // extended. The value of SExtIdx assumes that the paired load produces the
94   // value in this order: (I, returned iterator), i.e., -1 means no value has
95   // to be extended, 0 means I, and 1 means the returned iterator.
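  // For example (illustrative): pairing "ldrsw x0, [x2]" (I) with
  // "ldr w1, [x2, #4]" yields "ldp w0, w1, [x2]", and SExtIdx == 0 records
  // that the first result must still be sign-extended into x0.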
96   int SExtIdx = -1;
97 
98   // If not none, RenameReg can be used to rename the result register of the
99   // first store in a pair. Currently this only works when merging stores
100   // forward.
101   Optional<MCPhysReg> RenameReg = None;
102 
103   LdStPairFlags() = default;
104 
105   void setMergeForward(bool V = true) { MergeForward = V; }
106   bool getMergeForward() const { return MergeForward; }
107 
108   void setSExtIdx(int V) { SExtIdx = V; }
109   int getSExtIdx() const { return SExtIdx; }
110 
111   void setRenameReg(MCPhysReg R) { RenameReg = R; }
112   void clearRenameReg() { RenameReg = None; }
113   Optional<MCPhysReg> getRenameReg() const { return RenameReg; }
114 };
115 
116 struct AArch64LoadStoreOpt : public MachineFunctionPass {
117   static char ID;
118 
119   AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
120     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
121   }
122 
123   AliasAnalysis *AA;
124   const AArch64InstrInfo *TII;
125   const TargetRegisterInfo *TRI;
126   const AArch64Subtarget *Subtarget;
127 
128   // Track which register units have been modified and used.
129   LiveRegUnits ModifiedRegUnits, UsedRegUnits;
130   LiveRegUnits DefinedInBB;
131 
132   void getAnalysisUsage(AnalysisUsage &AU) const override {
133     AU.addRequired<AAResultsWrapperPass>();
134     MachineFunctionPass::getAnalysisUsage(AU);
135   }
136 
137   // Scan the instructions looking for a load/store that can be combined
138   // with the current instruction into a load/store pair.
139   // Return the matching instruction if one is found, else MBB->end().
140   MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
141                                                LdStPairFlags &Flags,
142                                                unsigned Limit,
143                                                bool FindNarrowMerge);
144 
145   // Scan the instructions looking for a store that writes to the address from
146   // which the current load instruction reads. Return true if one is found.
147   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
148                          MachineBasicBlock::iterator &StoreI);
149 
  // Merge the two narrow zero store instructions indicated into a wider store.
151   MachineBasicBlock::iterator
152   mergeNarrowZeroStores(MachineBasicBlock::iterator I,
153                         MachineBasicBlock::iterator MergeMI,
154                         const LdStPairFlags &Flags);
155 
156   // Merge the two instructions indicated into a single pair-wise instruction.
157   MachineBasicBlock::iterator
158   mergePairedInsns(MachineBasicBlock::iterator I,
159                    MachineBasicBlock::iterator Paired,
160                    const LdStPairFlags &Flags);
161 
162   // Promote the load that reads directly from the address stored to.
163   MachineBasicBlock::iterator
164   promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
165                        MachineBasicBlock::iterator StoreI);
166 
167   // Scan the instruction list to find a base register update that can
168   // be combined with the current instruction (a load or store) using
169   // pre or post indexed addressing with writeback. Scan forwards.
170   MachineBasicBlock::iterator
171   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
172                                 int UnscaledOffset, unsigned Limit);
173 
174   // Scan the instruction list to find a base register update that can
175   // be combined with the current instruction (a load or store) using
176   // pre or post indexed addressing with writeback. Scan backwards.
177   MachineBasicBlock::iterator
178   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
179 
180   // Find an instruction that updates the base register of the ld/st
181   // instruction.
182   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
183                             unsigned BaseReg, int Offset);
184 
185   // Merge a pre- or post-index base register update into a ld/st instruction.
186   MachineBasicBlock::iterator
187   mergeUpdateInsn(MachineBasicBlock::iterator I,
188                   MachineBasicBlock::iterator Update, bool IsPreIdx);
189 
190   // Find and merge zero store instructions.
191   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
192 
193   // Find and pair ldr/str instructions.
194   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
195 
196   // Find and promote load instructions which read directly from store.
197   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
198 
  // Find and merge base register updates before or after a ld/st instruction.
200   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
201 
202   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
203 
204   bool runOnMachineFunction(MachineFunction &Fn) override;
205 
206   MachineFunctionProperties getRequiredProperties() const override {
207     return MachineFunctionProperties().set(
208         MachineFunctionProperties::Property::NoVRegs);
209   }
210 
211   StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
212 };
213 
214 char AArch64LoadStoreOpt::ID = 0;
215 
216 } // end anonymous namespace
217 
218 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
219                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
220 
221 static bool isNarrowStore(unsigned Opc) {
222   switch (Opc) {
223   default:
224     return false;
225   case AArch64::STRBBui:
226   case AArch64::STURBBi:
227   case AArch64::STRHHui:
228   case AArch64::STURHHi:
229     return true;
230   }
231 }
232 
// These instructions set a memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
235 static bool isTagStore(const MachineInstr &MI) {
236   switch (MI.getOpcode()) {
237   default:
238     return false;
239   case AArch64::STGOffset:
240   case AArch64::STZGOffset:
241   case AArch64::ST2GOffset:
242   case AArch64::STZ2GOffset:
243     return true;
244   }
245 }
246 
247 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
248                                          bool *IsValidLdStrOpc = nullptr) {
249   if (IsValidLdStrOpc)
250     *IsValidLdStrOpc = true;
251   switch (Opc) {
252   default:
253     if (IsValidLdStrOpc)
254       *IsValidLdStrOpc = false;
255     return std::numeric_limits<unsigned>::max();
256   case AArch64::STRDui:
257   case AArch64::STURDi:
258   case AArch64::STRDpre:
259   case AArch64::STRQui:
260   case AArch64::STURQi:
261   case AArch64::STRQpre:
262   case AArch64::STRBBui:
263   case AArch64::STURBBi:
264   case AArch64::STRHHui:
265   case AArch64::STURHHi:
266   case AArch64::STRWui:
267   case AArch64::STRWpre:
268   case AArch64::STURWi:
269   case AArch64::STRXui:
270   case AArch64::STRXpre:
271   case AArch64::STURXi:
272   case AArch64::LDRDui:
273   case AArch64::LDURDi:
274   case AArch64::LDRDpre:
275   case AArch64::LDRQui:
276   case AArch64::LDURQi:
277   case AArch64::LDRQpre:
278   case AArch64::LDRWui:
279   case AArch64::LDURWi:
280   case AArch64::LDRWpre:
281   case AArch64::LDRXui:
282   case AArch64::LDURXi:
283   case AArch64::LDRXpre:
284   case AArch64::STRSui:
285   case AArch64::STURSi:
286   case AArch64::STRSpre:
287   case AArch64::LDRSui:
288   case AArch64::LDURSi:
289   case AArch64::LDRSpre:
290     return Opc;
291   case AArch64::LDRSWui:
292     return AArch64::LDRWui;
293   case AArch64::LDURSWi:
294     return AArch64::LDURWi;
295   }
296 }
297 
298 static unsigned getMatchingWideOpcode(unsigned Opc) {
299   switch (Opc) {
300   default:
301     llvm_unreachable("Opcode has no wide equivalent!");
302   case AArch64::STRBBui:
303     return AArch64::STRHHui;
304   case AArch64::STRHHui:
305     return AArch64::STRWui;
306   case AArch64::STURBBi:
307     return AArch64::STURHHi;
308   case AArch64::STURHHi:
309     return AArch64::STURWi;
310   case AArch64::STURWi:
311     return AArch64::STURXi;
312   case AArch64::STRWui:
313     return AArch64::STRXui;
314   }
315 }
316 
317 static unsigned getMatchingPairOpcode(unsigned Opc) {
318   switch (Opc) {
319   default:
320     llvm_unreachable("Opcode has no pairwise equivalent!");
321   case AArch64::STRSui:
322   case AArch64::STURSi:
323     return AArch64::STPSi;
324   case AArch64::STRSpre:
325     return AArch64::STPSpre;
326   case AArch64::STRDui:
327   case AArch64::STURDi:
328     return AArch64::STPDi;
329   case AArch64::STRDpre:
330     return AArch64::STPDpre;
331   case AArch64::STRQui:
332   case AArch64::STURQi:
333     return AArch64::STPQi;
334   case AArch64::STRQpre:
335     return AArch64::STPQpre;
336   case AArch64::STRWui:
337   case AArch64::STURWi:
338     return AArch64::STPWi;
339   case AArch64::STRWpre:
340     return AArch64::STPWpre;
341   case AArch64::STRXui:
342   case AArch64::STURXi:
343     return AArch64::STPXi;
344   case AArch64::STRXpre:
345     return AArch64::STPXpre;
346   case AArch64::LDRSui:
347   case AArch64::LDURSi:
348     return AArch64::LDPSi;
349   case AArch64::LDRSpre:
350     return AArch64::LDPSpre;
351   case AArch64::LDRDui:
352   case AArch64::LDURDi:
353     return AArch64::LDPDi;
354   case AArch64::LDRDpre:
355     return AArch64::LDPDpre;
356   case AArch64::LDRQui:
357   case AArch64::LDURQi:
358     return AArch64::LDPQi;
359   case AArch64::LDRQpre:
360     return AArch64::LDPQpre;
361   case AArch64::LDRWui:
362   case AArch64::LDURWi:
363     return AArch64::LDPWi;
364   case AArch64::LDRWpre:
365     return AArch64::LDPWpre;
366   case AArch64::LDRXui:
367   case AArch64::LDURXi:
368     return AArch64::LDPXi;
369   case AArch64::LDRXpre:
370     return AArch64::LDPXpre;
371   case AArch64::LDRSWui:
372   case AArch64::LDURSWi:
373     return AArch64::LDPSWi;
374   }
375 }
376 
377 static unsigned isMatchingStore(MachineInstr &LoadInst,
378                                 MachineInstr &StoreInst) {
379   unsigned LdOpc = LoadInst.getOpcode();
380   unsigned StOpc = StoreInst.getOpcode();
381   switch (LdOpc) {
382   default:
383     llvm_unreachable("Unsupported load instruction!");
384   case AArch64::LDRBBui:
385     return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
386            StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
387   case AArch64::LDURBBi:
388     return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
389            StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
390   case AArch64::LDRHHui:
391     return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
392            StOpc == AArch64::STRXui;
393   case AArch64::LDURHHi:
394     return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
395            StOpc == AArch64::STURXi;
396   case AArch64::LDRWui:
397     return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
398   case AArch64::LDURWi:
399     return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
400   case AArch64::LDRXui:
401     return StOpc == AArch64::STRXui;
402   case AArch64::LDURXi:
403     return StOpc == AArch64::STURXi;
404   }
405 }
406 
407 static unsigned getPreIndexedOpcode(unsigned Opc) {
408   // FIXME: We don't currently support creating pre-indexed loads/stores when
409   // the load or store is the unscaled version.  If we decide to perform such an
410   // optimization in the future the cases for the unscaled loads/stores will
411   // need to be added here.
412   switch (Opc) {
413   default:
414     llvm_unreachable("Opcode has no pre-indexed equivalent!");
415   case AArch64::STRSui:
416     return AArch64::STRSpre;
417   case AArch64::STRDui:
418     return AArch64::STRDpre;
419   case AArch64::STRQui:
420     return AArch64::STRQpre;
421   case AArch64::STRBBui:
422     return AArch64::STRBBpre;
423   case AArch64::STRHHui:
424     return AArch64::STRHHpre;
425   case AArch64::STRWui:
426     return AArch64::STRWpre;
427   case AArch64::STRXui:
428     return AArch64::STRXpre;
429   case AArch64::LDRSui:
430     return AArch64::LDRSpre;
431   case AArch64::LDRDui:
432     return AArch64::LDRDpre;
433   case AArch64::LDRQui:
434     return AArch64::LDRQpre;
435   case AArch64::LDRBBui:
436     return AArch64::LDRBBpre;
437   case AArch64::LDRHHui:
438     return AArch64::LDRHHpre;
439   case AArch64::LDRWui:
440     return AArch64::LDRWpre;
441   case AArch64::LDRXui:
442     return AArch64::LDRXpre;
443   case AArch64::LDRSWui:
444     return AArch64::LDRSWpre;
445   case AArch64::LDPSi:
446     return AArch64::LDPSpre;
447   case AArch64::LDPSWi:
448     return AArch64::LDPSWpre;
449   case AArch64::LDPDi:
450     return AArch64::LDPDpre;
451   case AArch64::LDPQi:
452     return AArch64::LDPQpre;
453   case AArch64::LDPWi:
454     return AArch64::LDPWpre;
455   case AArch64::LDPXi:
456     return AArch64::LDPXpre;
457   case AArch64::STPSi:
458     return AArch64::STPSpre;
459   case AArch64::STPDi:
460     return AArch64::STPDpre;
461   case AArch64::STPQi:
462     return AArch64::STPQpre;
463   case AArch64::STPWi:
464     return AArch64::STPWpre;
465   case AArch64::STPXi:
466     return AArch64::STPXpre;
467   case AArch64::STGOffset:
468     return AArch64::STGPreIndex;
469   case AArch64::STZGOffset:
470     return AArch64::STZGPreIndex;
471   case AArch64::ST2GOffset:
472     return AArch64::ST2GPreIndex;
473   case AArch64::STZ2GOffset:
474     return AArch64::STZ2GPreIndex;
475   case AArch64::STGPi:
476     return AArch64::STGPpre;
477   }
478 }
479 
480 static unsigned getPostIndexedOpcode(unsigned Opc) {
481   switch (Opc) {
482   default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
484   case AArch64::STRSui:
485   case AArch64::STURSi:
486     return AArch64::STRSpost;
487   case AArch64::STRDui:
488   case AArch64::STURDi:
489     return AArch64::STRDpost;
490   case AArch64::STRQui:
491   case AArch64::STURQi:
492     return AArch64::STRQpost;
493   case AArch64::STRBBui:
494     return AArch64::STRBBpost;
495   case AArch64::STRHHui:
496     return AArch64::STRHHpost;
497   case AArch64::STRWui:
498   case AArch64::STURWi:
499     return AArch64::STRWpost;
500   case AArch64::STRXui:
501   case AArch64::STURXi:
502     return AArch64::STRXpost;
503   case AArch64::LDRSui:
504   case AArch64::LDURSi:
505     return AArch64::LDRSpost;
506   case AArch64::LDRDui:
507   case AArch64::LDURDi:
508     return AArch64::LDRDpost;
509   case AArch64::LDRQui:
510   case AArch64::LDURQi:
511     return AArch64::LDRQpost;
512   case AArch64::LDRBBui:
513     return AArch64::LDRBBpost;
514   case AArch64::LDRHHui:
515     return AArch64::LDRHHpost;
516   case AArch64::LDRWui:
517   case AArch64::LDURWi:
518     return AArch64::LDRWpost;
519   case AArch64::LDRXui:
520   case AArch64::LDURXi:
521     return AArch64::LDRXpost;
522   case AArch64::LDRSWui:
523     return AArch64::LDRSWpost;
524   case AArch64::LDPSi:
525     return AArch64::LDPSpost;
526   case AArch64::LDPSWi:
527     return AArch64::LDPSWpost;
528   case AArch64::LDPDi:
529     return AArch64::LDPDpost;
530   case AArch64::LDPQi:
531     return AArch64::LDPQpost;
532   case AArch64::LDPWi:
533     return AArch64::LDPWpost;
534   case AArch64::LDPXi:
535     return AArch64::LDPXpost;
536   case AArch64::STPSi:
537     return AArch64::STPSpost;
538   case AArch64::STPDi:
539     return AArch64::STPDpost;
540   case AArch64::STPQi:
541     return AArch64::STPQpost;
542   case AArch64::STPWi:
543     return AArch64::STPWpost;
544   case AArch64::STPXi:
545     return AArch64::STPXpost;
546   case AArch64::STGOffset:
547     return AArch64::STGPostIndex;
548   case AArch64::STZGOffset:
549     return AArch64::STZGPostIndex;
550   case AArch64::ST2GOffset:
551     return AArch64::ST2GPostIndex;
552   case AArch64::STZ2GOffset:
553     return AArch64::STZ2GPostIndex;
554   case AArch64::STGPi:
555     return AArch64::STGPpost;
556   }
557 }
558 
559 static bool isPairedLdSt(const MachineInstr &MI) {
560   switch (MI.getOpcode()) {
561   default:
562     return false;
563   case AArch64::LDPSi:
564   case AArch64::LDPSWi:
565   case AArch64::LDPDi:
566   case AArch64::LDPQi:
567   case AArch64::LDPWi:
568   case AArch64::LDPXi:
569   case AArch64::STPSi:
570   case AArch64::STPDi:
571   case AArch64::STPQi:
572   case AArch64::STPWi:
573   case AArch64::STPXi:
574   case AArch64::STGPi:
575     return true;
576   }
577 }
578 
579 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
580 
581   unsigned OpcA = FirstMI.getOpcode();
582   unsigned OpcB = MI.getOpcode();
583 
584   switch (OpcA) {
585   default:
586     return false;
587   case AArch64::STRSpre:
588     return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
589   case AArch64::STRDpre:
590     return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
591   case AArch64::STRQpre:
592     return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
593   case AArch64::STRWpre:
594     return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
595   case AArch64::STRXpre:
596     return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
597   case AArch64::LDRSpre:
598     return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
599   case AArch64::LDRDpre:
600     return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
601   case AArch64::LDRQpre:
602     return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
603   case AArch64::LDRWpre:
604     return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
605   case AArch64::LDRXpre:
606     return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
607   }
608 }
609 
610 // Returns the scale and offset range of pre/post indexed variants of MI.
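// For example (illustrative): STPXi has a scale of 8, so the writeback
// immediate of STPXpre/STPXpost spans byte offsets [-512, 504], whereas the
// pre/post-indexed forms of STRXui are unscaled and span [-256, 255].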
611 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
612                                        int &MinOffset, int &MaxOffset) {
613   bool IsPaired = isPairedLdSt(MI);
614   bool IsTagStore = isTagStore(MI);
615   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
616   // as in the "unsigned offset" variant.
617   // All other pre/post indexed ldst instructions are unscaled.
618   Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
619 
620   if (IsPaired) {
621     MinOffset = -64;
622     MaxOffset = 63;
623   } else {
624     MinOffset = -256;
625     MaxOffset = 255;
626   }
627 }
628 
629 static MachineOperand &getLdStRegOp(MachineInstr &MI,
630                                     unsigned PairedRegOp = 0) {
631   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
632   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
633   if (IsPreLdSt)
634     PairedRegOp += 1;
635   unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
636   return MI.getOperand(Idx);
637 }
638 
639 static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
640   unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
641   return MI.getOperand(Idx);
642 }
643 
644 static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
645   unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
646   return MI.getOperand(Idx);
647 }
648 
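// Returns whether the load is fully covered by the store. For example
// (illustrative): "str x1, [x0, #8]" writes bytes [8, 16), so the 4-byte
// "ldr w2, [x0, #12]" is in range, while one at byte offset 16 is not.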
649 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
650                                   MachineInstr &StoreInst,
651                                   const AArch64InstrInfo *TII) {
652   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
653   int LoadSize = TII->getMemScale(LoadInst);
654   int StoreSize = TII->getMemScale(StoreInst);
655   int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
656                              ? getLdStOffsetOp(StoreInst).getImm()
657                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
658   int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
659                              ? getLdStOffsetOp(LoadInst).getImm()
660                              : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
661   return (UnscaledStOffset <= UnscaledLdOffset) &&
662          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
663 }
664 
665 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
666   unsigned Opc = MI.getOpcode();
667   return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
668           isNarrowStore(Opc)) &&
669          getLdStRegOp(MI).getReg() == AArch64::WZR;
670 }
671 
672 static bool isPromotableLoadFromStore(MachineInstr &MI) {
673   switch (MI.getOpcode()) {
674   default:
675     return false;
676   // Scaled instructions.
677   case AArch64::LDRBBui:
678   case AArch64::LDRHHui:
679   case AArch64::LDRWui:
680   case AArch64::LDRXui:
681   // Unscaled instructions.
682   case AArch64::LDURBBi:
683   case AArch64::LDURHHi:
684   case AArch64::LDURWi:
685   case AArch64::LDURXi:
686     return true;
687   }
688 }
689 
690 static bool isMergeableLdStUpdate(MachineInstr &MI) {
691   unsigned Opc = MI.getOpcode();
692   switch (Opc) {
693   default:
694     return false;
695   // Scaled instructions.
696   case AArch64::STRSui:
697   case AArch64::STRDui:
698   case AArch64::STRQui:
699   case AArch64::STRXui:
700   case AArch64::STRWui:
701   case AArch64::STRHHui:
702   case AArch64::STRBBui:
703   case AArch64::LDRSui:
704   case AArch64::LDRDui:
705   case AArch64::LDRQui:
706   case AArch64::LDRXui:
707   case AArch64::LDRWui:
708   case AArch64::LDRHHui:
709   case AArch64::LDRBBui:
710   case AArch64::STGOffset:
711   case AArch64::STZGOffset:
712   case AArch64::ST2GOffset:
713   case AArch64::STZ2GOffset:
714   case AArch64::STGPi:
715   // Unscaled instructions.
716   case AArch64::STURSi:
717   case AArch64::STURDi:
718   case AArch64::STURQi:
719   case AArch64::STURWi:
720   case AArch64::STURXi:
721   case AArch64::LDURSi:
722   case AArch64::LDURDi:
723   case AArch64::LDURQi:
724   case AArch64::LDURWi:
725   case AArch64::LDURXi:
726   // Paired instructions.
727   case AArch64::LDPSi:
728   case AArch64::LDPSWi:
729   case AArch64::LDPDi:
730   case AArch64::LDPQi:
731   case AArch64::LDPWi:
732   case AArch64::LDPXi:
733   case AArch64::STPSi:
734   case AArch64::STPDi:
735   case AArch64::STPQi:
736   case AArch64::STPWi:
737   case AArch64::STPXi:
738     // Make sure this is a reg+imm (as opposed to an address reloc).
739     if (!getLdStOffsetOp(MI).isImm())
740       return false;
741 
742     return true;
743   }
744 }
745 
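// Merge two adjacent zero stores into a single store of twice the width, e.g.
// (illustrative) "strh wzr, [x0]" + "strh wzr, [x0, #2]" => "str wzr, [x0]",
// or "str wzr, [x0]" + "str wzr, [x0, #4]" => "str xzr, [x0]".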
746 MachineBasicBlock::iterator
747 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
748                                            MachineBasicBlock::iterator MergeMI,
749                                            const LdStPairFlags &Flags) {
750   assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
751          "Expected promotable zero stores.");
752 
753   MachineBasicBlock::iterator E = I->getParent()->end();
754   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
755   // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way the merge will invalidate the iterator,
757   // and we don't need to scan the new instruction, as it's a pairwise
758   // instruction, which we're not considering for further action anyway.
759   if (NextI == MergeMI)
760     NextI = next_nodbg(NextI, E);
761 
762   unsigned Opc = I->getOpcode();
763   bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
764   int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
765 
766   bool MergeForward = Flags.getMergeForward();
767   // Insert our new paired instruction after whichever of the paired
768   // instructions MergeForward indicates.
769   MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags are compatible with the input code.
772   const MachineOperand &BaseRegOp =
773       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
774 
775   // Which register is Rt and which is Rt2 depends on the offset order.
776   MachineInstr *RtMI;
777   if (getLdStOffsetOp(*I).getImm() ==
778       getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
779     RtMI = &*MergeMI;
780   else
781     RtMI = &*I;
782 
783   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  // Convert the scaled offset into the scale of the wider store type.
785   if (IsScaled) {
786     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
787     OffsetImm /= 2;
788   }
789 
790   // Construct the new instruction.
791   DebugLoc DL = I->getDebugLoc();
792   MachineBasicBlock *MBB = I->getParent();
793   MachineInstrBuilder MIB;
794   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
795             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
796             .add(BaseRegOp)
797             .addImm(OffsetImm)
798             .cloneMergedMemRefs({&*I, &*MergeMI})
799             .setMIFlags(I->mergeFlagsWith(*MergeMI));
800   (void)MIB;
801 
802   LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
803   LLVM_DEBUG(I->print(dbgs()));
804   LLVM_DEBUG(dbgs() << "    ");
805   LLVM_DEBUG(MergeMI->print(dbgs()));
806   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
807   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
808   LLVM_DEBUG(dbgs() << "\n");
809 
810   // Erase the old instructions.
811   I->eraseFromParent();
812   MergeMI->eraseFromParent();
813   return NextI;
814 }
815 
816 // Apply Fn to all instructions between MI and the beginning of the block, until
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
819 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
820                               const TargetRegisterInfo *TRI, unsigned Limit,
821                               std::function<bool(MachineInstr &, bool)> &Fn) {
822   auto MBB = MI.getParent();
823   for (MachineInstr &I :
824        instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
825     if (!Limit)
826       return false;
827     --Limit;
828 
829     bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
830       return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
831              TRI->regsOverlap(MOP.getReg(), DefReg);
832     });
833     if (!Fn(I, isDef))
834       return false;
835     if (isDef)
836       break;
837   }
838   return true;
839 }
840 
841 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
842                                    const TargetRegisterInfo *TRI) {
843 
844   for (const MachineOperand &MOP : phys_regs_and_masks(MI))
845     if (MOP.isReg() && MOP.isKill())
846       Units.removeReg(MOP.getReg());
847 
848   for (const MachineOperand &MOP : phys_regs_and_masks(MI))
849     if (MOP.isReg() && !MOP.isKill())
850       Units.addReg(MOP.getReg());
851 }
852 
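// Combine the two indicated ld/st instructions into a single paired
// instruction, e.g. (illustrative) "ldr w0, [x2]" + "ldur w1, [x2, #4]"
// => "ldp w0, w1, [x2]". Mixed scaled/unscaled offsets are normalized below
// before deciding which operand becomes Rt and which Rt2.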
853 MachineBasicBlock::iterator
854 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
855                                       MachineBasicBlock::iterator Paired,
856                                       const LdStPairFlags &Flags) {
857   MachineBasicBlock::iterator E = I->getParent()->end();
858   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
859   // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way the merge will invalidate the iterator,
861   // and we don't need to scan the new instruction, as it's a pairwise
862   // instruction, which we're not considering for further action anyway.
863   if (NextI == Paired)
864     NextI = next_nodbg(NextI, E);
865 
866   int SExtIdx = Flags.getSExtIdx();
867   unsigned Opc =
868       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
869   bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
870   int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
871 
872   bool MergeForward = Flags.getMergeForward();
873 
874   Optional<MCPhysReg> RenameReg = Flags.getRenameReg();
875   if (MergeForward && RenameReg) {
876     MCRegister RegToRename = getLdStRegOp(*I).getReg();
877     DefinedInBB.addReg(*RenameReg);
878 
879     // Return the sub/super register for RenameReg, matching the size of
880     // OriginalReg.
881     auto GetMatchingSubReg = [this,
882                               RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
883       for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
884         if (TRI->getMinimalPhysRegClass(OriginalReg) ==
885             TRI->getMinimalPhysRegClass(SubOrSuper))
886           return SubOrSuper;
887       llvm_unreachable("Should have found matching sub or super register!");
888     };
889 
890     std::function<bool(MachineInstr &, bool)> UpdateMIs =
891         [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
892           if (IsDef) {
893             bool SeenDef = false;
894             for (auto &MOP : MI.operands()) {
895               // Rename the first explicit definition and all implicit
896               // definitions matching RegToRename.
897               if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
898                   (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
899                   TRI->regsOverlap(MOP.getReg(), RegToRename)) {
900                 assert((MOP.isImplicit() ||
901                         (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
902                        "Need renamable operands");
903                 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
904                 SeenDef = true;
905               }
906             }
907           } else {
908             for (auto &MOP : MI.operands()) {
909               if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
910                   TRI->regsOverlap(MOP.getReg(), RegToRename)) {
911                 assert((MOP.isImplicit() ||
912                         (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
913                            "Need renamable operands");
914                 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
915               }
916             }
917           }
918           LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
919           return true;
920         };
921     forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
922 
923 #if !defined(NDEBUG)
924     // Make sure the register used for renaming is not used between the paired
925     // instructions. That would trash the content before the new paired
926     // instruction.
927     for (auto &MI :
928          iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
929              std::next(I), std::next(Paired)))
930       assert(all_of(MI.operands(),
931                     [this, &RenameReg](const MachineOperand &MOP) {
932                       return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
933                              MOP.isUndef() ||
934                              !TRI->regsOverlap(MOP.getReg(), *RenameReg);
935                     }) &&
             "Rename register used between paired instructions, trashing the "
             "content");
938 #endif
939   }
940 
941   // Insert our new paired instruction after whichever of the paired
942   // instructions MergeForward indicates.
943   MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags are compatible with the input code.
946   const MachineOperand &BaseRegOp =
947       MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
948 
949   int Offset = getLdStOffsetOp(*I).getImm();
950   int PairedOffset = getLdStOffsetOp(*Paired).getImm();
951   bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
952   if (IsUnscaled != PairedIsUnscaled) {
953     // We're trying to pair instructions that differ in how they are scaled.  If
954     // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
955     // the opposite (i.e., make Paired's offset unscaled).
956     int MemSize = TII->getMemScale(*Paired);
957     if (PairedIsUnscaled) {
958       // If the unscaled offset isn't a multiple of the MemSize, we can't
959       // pair the operations together.
960       assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
961              "Offset should be a multiple of the stride!");
962       PairedOffset /= MemSize;
963     } else {
964       PairedOffset *= MemSize;
965     }
966   }
967 
968   // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre-indexed load/stores, Rt should be the register of the
  // pre-indexed load/store.
971   MachineInstr *RtMI, *Rt2MI;
972   if (Offset == PairedOffset + OffsetStride &&
973       !AArch64InstrInfo::isPreLdSt(*I)) {
974     RtMI = &*Paired;
975     Rt2MI = &*I;
976     // Here we swapped the assumption made for SExtIdx.
977     // I.e., we turn ldp I, Paired into ldp Paired, I.
978     // Update the index accordingly.
979     if (SExtIdx != -1)
980       SExtIdx = (SExtIdx + 1) % 2;
981   } else {
982     RtMI = &*I;
983     Rt2MI = &*Paired;
984   }
985   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
986   // Scale the immediate offset, if necessary.
987   if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
988     assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
989            "Unscaled offset cannot be scaled.");
990     OffsetImm /= TII->getMemScale(*RtMI);
991   }
992 
993   // Construct the new instruction.
994   MachineInstrBuilder MIB;
995   DebugLoc DL = I->getDebugLoc();
996   MachineBasicBlock *MBB = I->getParent();
997   MachineOperand RegOp0 = getLdStRegOp(*RtMI);
998   MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
999   // Kill flags may become invalid when moving stores for pairing.
1000   if (RegOp0.isUse()) {
1001     if (!MergeForward) {
1002       // Clear kill flags on store if moving upwards. Example:
1003       //   STRWui %w0, ...
1004       //   USE %w1
1005       //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
1006       RegOp0.setIsKill(false);
1007       RegOp1.setIsKill(false);
1008     } else {
      // Clear kill flags of the first store's register. Example:
1010       //   STRWui %w1, ...
1011       //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
1012       //   STRW %w0
1013       Register Reg = getLdStRegOp(*I).getReg();
1014       for (MachineInstr &MI : make_range(std::next(I), Paired))
1015         MI.clearRegisterKills(Reg, TRI);
1016     }
1017   }
1018 
  unsigned MatchPairOpcode = getMatchingPairOpcode(Opc);
1020   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
1021 
1022   // Adds the pre-index operand for pre-indexed ld/st pairs.
1023   if (AArch64InstrInfo::isPreLdSt(*RtMI))
1024     MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1025 
1026   MIB.add(RegOp0)
1027       .add(RegOp1)
1028       .add(BaseRegOp)
1029       .addImm(OffsetImm)
1030       .cloneMergedMemRefs({&*I, &*Paired})
1031       .setMIFlags(I->mergeFlagsWith(*Paired));
1032 
1033   (void)MIB;
1034 
1035   LLVM_DEBUG(
1036       dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
1037   LLVM_DEBUG(I->print(dbgs()));
1038   LLVM_DEBUG(dbgs() << "    ");
1039   LLVM_DEBUG(Paired->print(dbgs()));
1040   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1041   if (SExtIdx != -1) {
1042     // Generate the sign extension for the proper result of the ldp.
1043     // I.e., with X1, that would be:
1044     // %w1 = KILL %w1, implicit-def %x1
1045     // %x1 = SBFMXri killed %x1, 0, 31
1046     MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1047     // Right now, DstMO has the extended register, since it comes from an
1048     // extended opcode.
1049     Register DstRegX = DstMO.getReg();
1050     // Get the W variant of that register.
1051     Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1052     // Update the result of LDP to use the W instead of the X variant.
1053     DstMO.setReg(DstRegW);
1054     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1055     LLVM_DEBUG(dbgs() << "\n");
1056     // Make the machine verifier happy by providing a definition for
1057     // the X register.
1058     // Insert this definition right after the generated LDP, i.e., before
1059     // InsertionPoint.
1060     MachineInstrBuilder MIBKill =
1061         BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1062             .addReg(DstRegW)
1063             .addReg(DstRegX, RegState::Define);
1064     MIBKill->getOperand(2).setImplicit();
1065     // Create the sign extension.
1066     MachineInstrBuilder MIBSXTW =
1067         BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1068             .addReg(DstRegX)
1069             .addImm(0)
1070             .addImm(31);
1071     (void)MIBSXTW;
1072     LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
1073     LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1074   } else {
1075     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1076   }
1077   LLVM_DEBUG(dbgs() << "\n");
1078 
1079   if (MergeForward)
1080     for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1081       if (MOP.isReg() && MOP.isKill())
1082         DefinedInBB.addReg(MOP.getReg());
1083 
1084   // Erase the old instructions.
1085   I->eraseFromParent();
1086   Paired->eraseFromParent();
1087 
1088   return NextI;
1089 }
1090 
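// Forward the stored value to a load that reads the same bytes back, e.g.
// (illustrative):
//   str x0, [x4, #8]
//   ldr x1, [x4, #8]    =>    mov x1, x0   (an ORRXrs with XZR)
// Narrower loads are instead rewritten as AND/UBFM bitfield extracts of the
// stored register.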
1091 MachineBasicBlock::iterator
1092 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1093                                           MachineBasicBlock::iterator StoreI) {
1094   MachineBasicBlock::iterator NextI =
1095       next_nodbg(LoadI, LoadI->getParent()->end());
1096 
1097   int LoadSize = TII->getMemScale(*LoadI);
1098   int StoreSize = TII->getMemScale(*StoreI);
1099   Register LdRt = getLdStRegOp(*LoadI).getReg();
1100   const MachineOperand &StMO = getLdStRegOp(*StoreI);
1101   Register StRt = getLdStRegOp(*StoreI).getReg();
1102   bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1103 
1104   assert((IsStoreXReg ||
1105           TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1106          "Unexpected RegClass");
1107 
1108   MachineInstr *BitExtMI;
1109   if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if the destination register of the load is the same
    // register as the stored value.
1112     if (StRt == LdRt && LoadSize == 8) {
1113       for (MachineInstr &MI : make_range(StoreI->getIterator(),
1114                                          LoadI->getIterator())) {
1115         if (MI.killsRegister(StRt, TRI)) {
1116           MI.clearRegisterKills(StRt, TRI);
1117           break;
1118         }
1119       }
1120       LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
1121       LLVM_DEBUG(LoadI->print(dbgs()));
1122       LLVM_DEBUG(dbgs() << "\n");
1123       LoadI->eraseFromParent();
1124       return NextI;
1125     }
    // Replace the load with a mov if the load and store have the same size.
1127     BitExtMI =
1128         BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1129                 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1130             .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1131             .add(StMO)
1132             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1133             .setMIFlags(LoadI->getFlags());
1134   } else {
1135     // FIXME: Currently we disable this transformation in big-endian targets as
1136     // performance and correctness are verified only in little-endian.
1137     if (!Subtarget->isLittleEndian())
1138       return NextI;
1139     bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1140     assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1141            "Unsupported ld/st match");
1142     assert(LoadSize <= StoreSize && "Invalid load size");
1143     int UnscaledLdOffset = IsUnscaled
1144                                ? getLdStOffsetOp(*LoadI).getImm()
1145                                : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1146     int UnscaledStOffset = IsUnscaled
1147                                ? getLdStOffsetOp(*StoreI).getImm()
1148                                : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1149     int Width = LoadSize * 8;
1150     Register DestReg =
1151         IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1152                           LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1153                     : LdRt;
1154 
1155     assert((UnscaledLdOffset >= UnscaledStOffset &&
1156             (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1157            "Invalid offset");
1158 
1159     int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1160     int Imms = Immr + Width - 1;
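    // For example (illustrative): forwarding a 4-byte load at byte offset 4
    // out of an 8-byte store at offset 0 gives Immr = 32 and Imms = 63, i.e.
    // a UBFMXri extracting bits [63:32] of the stored value.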
1161     if (UnscaledLdOffset == UnscaledStOffset) {
1162       uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1163                                 | ((Immr) << 6)               // immr
1164                                 | ((Imms) << 0)               // imms
1165           ;
1166 
1167       BitExtMI =
1168           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1169                   TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1170                   DestReg)
1171               .add(StMO)
1172               .addImm(AndMaskEncoded)
1173               .setMIFlags(LoadI->getFlags());
1174     } else {
1175       BitExtMI =
1176           BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1177                   TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1178                   DestReg)
1179               .add(StMO)
1180               .addImm(Immr)
1181               .addImm(Imms)
1182               .setMIFlags(LoadI->getFlags());
1183     }
1184   }
1185 
1186   // Clear kill flags between store and load.
1187   for (MachineInstr &MI : make_range(StoreI->getIterator(),
1188                                      BitExtMI->getIterator()))
1189     if (MI.killsRegister(StRt, TRI)) {
1190       MI.clearRegisterKills(StRt, TRI);
1191       break;
1192     }
1193 
1194   LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
1195   LLVM_DEBUG(StoreI->print(dbgs()));
1196   LLVM_DEBUG(dbgs() << "    ");
1197   LLVM_DEBUG(LoadI->print(dbgs()));
1198   LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
1199   LLVM_DEBUG(StoreI->print(dbgs()));
1200   LLVM_DEBUG(dbgs() << "    ");
1201   LLVM_DEBUG((BitExtMI)->print(dbgs()));
1202   LLVM_DEBUG(dbgs() << "\n");
1203 
1204   // Erase the old instructions.
1205   LoadI->eraseFromParent();
1206   return NextI;
1207 }
1208 
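// Returns whether a paired instruction can encode this offset. For example
// (illustrative): an unscaled "ldur x0, [x1, #32]" has element offset 4 with
// an 8-byte stride, which fits the signed 7-bit pair range [-64, 63].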
1209 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled load/stores into an "element"
  // offset used by the scaled pair load/store instructions.
1212   if (IsUnscaled) {
1213     // If the byte-offset isn't a multiple of the stride, there's no point
1214     // trying to match it.
1215     if (Offset % OffsetStride)
1216       return false;
1217     Offset /= OffsetStride;
1218   }
1219   return Offset <= 63 && Offset >= -64;
1220 }
1221 
// Do alignment, specialized to powers of 2 and signed ints, avoiding having
// to do a C-style cast from uint64_t to int when using alignTo from
// include/llvm/Support/MathExtras.h.
1225 // FIXME: Move this function to include/MathExtras.h?
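// For example, alignTo(5, 8) == 8 and alignTo(-5, 8) == 0.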
1226 static int alignTo(int Num, int PowOf2) {
1227   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1228 }
1229 
1230 static bool mayAlias(MachineInstr &MIa,
1231                      SmallVectorImpl<MachineInstr *> &MemInsns,
1232                      AliasAnalysis *AA) {
1233   for (MachineInstr *MIb : MemInsns)
1234     if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
1235       return true;
1236 
1237   return false;
1238 }
1239 
1240 bool AArch64LoadStoreOpt::findMatchingStore(
1241     MachineBasicBlock::iterator I, unsigned Limit,
1242     MachineBasicBlock::iterator &StoreI) {
1243   MachineBasicBlock::iterator B = I->getParent()->begin();
1244   MachineBasicBlock::iterator MBBI = I;
1245   MachineInstr &LoadMI = *I;
1246   Register BaseReg = getLdStBaseOp(LoadMI).getReg();
1247 
1248   // If the load is the first instruction in the block, there's obviously
1249   // not any matching store.
1250   if (MBBI == B)
1251     return false;
1252 
1253   // Track which register units have been modified and used between the first
1254   // insn and the second insn.
1255   ModifiedRegUnits.clear();
1256   UsedRegUnits.clear();
1257 
1258   unsigned Count = 0;
1259   do {
1260     MBBI = prev_nodbg(MBBI, B);
1261     MachineInstr &MI = *MBBI;
1262 
1263     // Don't count transient instructions towards the search limit since there
1264     // may be different numbers of them if e.g. debug information is present.
1265     if (!MI.isTransient())
1266       ++Count;
1267 
1268     // If the load instruction reads directly from the address to which the
1269     // store instruction writes and the stored value is not modified, we can
1270     // promote the load. Since we do not handle stores with pre-/post-index,
1271     // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also, we can't handle stores without an immediate offset operand,
    // since the operand might be the address of a global variable.
1274     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1275         BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
1276         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1277         ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1278       StoreI = MBBI;
1279       return true;
1280     }
1281 
1282     if (MI.isCall())
1283       return false;
1284 
1285     // Update modified / uses register units.
1286     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1287 
1288     // Otherwise, if the base register is modified, we have no match, so
1289     // return early.
1290     if (!ModifiedRegUnits.available(BaseReg))
1291       return false;
1292 
1293     // If we encounter a store aliased with the load, return early.
1294     if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1295       return false;
1296   } while (MBBI != B && Count < Limit);
1297   return false;
1298 }
1299 
1300 // Returns true if FirstMI and MI are candidates for merging or pairing.
1301 // Otherwise, returns false.
1302 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1303                                        LdStPairFlags &Flags,
1304                                        const AArch64InstrInfo *TII) {
1305   // If this is volatile or if pairing is suppressed, not a candidate.
1306   if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1307     return false;
1308 
1309   // We should have already checked FirstMI for pair suppression and volatility.
1310   assert(!FirstMI.hasOrderedMemoryRef() &&
1311          !TII->isLdStPairSuppressed(FirstMI) &&
1312          "FirstMI shouldn't get here if either of these checks are true.");
1313 
1314   unsigned OpcA = FirstMI.getOpcode();
1315   unsigned OpcB = MI.getOpcode();
1316 
1317   // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1318   if (OpcA == OpcB)
1319     return !AArch64InstrInfo::isPreLdSt(FirstMI);
1320 
1321   // Try to match a sign-extended load/store with a zero-extended load/store.
1322   bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1323   unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1324   assert(IsValidLdStrOpc &&
1325          "Given Opc should be a Load or Store with an immediate");
1326   // OpcA will be the first instruction in the pair.
1327   if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1328     Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1329     return true;
1330   }
1331 
1332   // If the second instruction isn't even a mergable/pairable load/store, bail
1333   // out.
1334   if (!PairIsValidLdStrOpc)
1335     return false;
1336 
1337   // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1338   // offsets.
1339   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1340     return false;
1341 
1342   // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1343   // LDR<S,D,Q,W,X>pre-LDR<S,D,Q,W,X>ui
1344   // are candidate pairs that can be merged.
1345   if (isPreLdStPairCandidate(FirstMI, MI))
1346     return true;
1347 
1348   // Try to match an unscaled load/store with a scaled load/store.
1349   return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1350          getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1351 
1352   // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1353 }
1354 
1355 static bool
1356 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1357                  SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1358                  const TargetRegisterInfo *TRI) {
1359   if (!FirstMI.mayStore())
1360     return false;
1361 
1362   // Check if we can find an unused register which we can use to rename
1363   // the register used by the first load/store.
1364   auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1365   MachineFunction &MF = *FirstMI.getParent()->getParent();
1366   if (!RegClass || !MF.getRegInfo().tracksLiveness())
1367     return false;
1368 
1369   auto RegToRename = getLdStRegOp(FirstMI).getReg();
1370   // For now, we only rename if the store operand gets killed at the store.
1371   if (!getLdStRegOp(FirstMI).isKill() &&
1372       !any_of(FirstMI.operands(),
1373               [TRI, RegToRename](const MachineOperand &MOP) {
1374                 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1375                        MOP.isImplicit() && MOP.isKill() &&
1376                        TRI->regsOverlap(RegToRename, MOP.getReg());
1377               })) {
1378     LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI << "\n");
1379     return false;
1380   }
1381   auto canRenameMOP = [TRI](const MachineOperand &MOP) {
1382     if (MOP.isReg()) {
1383       auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1384       // Renaming registers with multiple disjunct sub-registers (e.g. the
1385       // result of a LD3) means that all sub-registers are renamed, potentially
1386       // impacting other instructions we did not check. Bail out.
1387       // Note that this relies on the structure of the AArch64 register file. In
1388       // particular, a subregister cannot be written without overwriting the
1389       // whole register.
1390       if (RegClass->HasDisjunctSubRegs) {
1391         LLVM_DEBUG(
1392             dbgs()
1393             << "  Cannot rename operands with multiple disjunct subregisters ("
1394             << MOP << ")\n");
1395         return false;
1396       }
1397     }
1398     return MOP.isImplicit() ||
1399            (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1400   };
1401 
1402   bool FoundDef = false;
1403 
1404   // For each instruction between FirstMI and the previous def for RegToRename,
1405   // we
1406   // * check if we can rename RegToRename in this instruction
1407   // * collect the registers used and required register classes for RegToRename.
1408   std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1409                                                            bool IsDef) {
1410     LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
1411     // Currently we do not try to rename across frame-setup instructions.
1412     if (MI.getFlag(MachineInstr::FrameSetup)) {
1413       LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions currently ("
1414                         << MI << ")\n");
1415       return false;
1416     }
1417 
1418     UsedInBetween.accumulate(MI);
1419 
1420     // For a definition, check that we can rename the definition and exit the
1421     // loop.
1422     FoundDef = IsDef;
1423 
1424     // For defs, check if we can rename the first def of RegToRename.
1425     if (FoundDef) {
1426       // For some pseudo instructions, we might not generate code in the end
1427       // (e.g. KILL) and we would end up without a correct def for the rename
1428       // register.
1429       // TODO: This might be overly conservative and we could handle those cases
1430       // in multiple ways:
1431       //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find a non-pseudo def.
1433       if (MI.isPseudo()) {
1434         LLVM_DEBUG(dbgs() << "  Cannot rename pseudo instruction " << MI
1435                           << "\n");
1436         return false;
1437       }
1438 
1439       for (auto &MOP : MI.operands()) {
1440         if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1441             !TRI->regsOverlap(MOP.getReg(), RegToRename))
1442           continue;
1443         if (!canRenameMOP(MOP)) {
1444           LLVM_DEBUG(dbgs()
1445                      << "  Cannot rename " << MOP << " in " << MI << "\n");
1446           return false;
1447         }
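        // Remember the register class so that a rename register can later be
        // checked for compatibility with this operand.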
1448         RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1449       }
1450       return true;
1451     } else {
1452       for (auto &MOP : MI.operands()) {
1453         if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1454             !TRI->regsOverlap(MOP.getReg(), RegToRename))
1455           continue;
1456 
1457         if (!canRenameMOP(MOP)) {
1458           LLVM_DEBUG(dbgs()
1459                      << "  Cannot rename " << MOP << " in " << MI << "\n");
1460           return false;
1461         }
1462         RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1463       }
1464     }
1465     return true;
1466   };
1467 
1468   if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1469     return false;
1470 
1471   if (!FoundDef) {
1472     LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
1473     return false;
1474   }
1475   return true;
1476 }
1477 
1478 // Check if we can find a physical register for renaming \p Reg. This register
1479 // must:
1480 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1481 //   defined registers up to the point where the renamed register will be used,
// * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range in which the rename register will be used, and
// * be available in all required register classes (checked using
//   \p RequiredClasses).
1485 static Optional<MCPhysReg> tryToFindRegisterToRename(
1486     const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1487     LiveRegUnits &UsedInBetween,
1488     SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1489     const TargetRegisterInfo *TRI) {
1490   const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1491 
1492   // Checks if any sub- or super-register of PR is callee saved.
1493   auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1494     return any_of(TRI->sub_and_superregs_inclusive(PR),
1495                   [&MF, TRI](MCPhysReg SubOrSuper) {
1496                     return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1497                   });
1498   };
1499 
1500   // Check if PR or one of its sub- or super-registers can be used for all
1501   // required register classes.
1502   auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1503     return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1504       return any_of(TRI->sub_and_superregs_inclusive(PR),
1505                     [C, TRI](MCPhysReg SubOrSuper) {
1506                       return C == TRI->getMinimalPhysRegClass(SubOrSuper);
1507                     });
1508     });
1509   };
1510 
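  // Scan the register class of \p Reg and pick the first register that is not
  // reserved, is not defined or used in the relevant ranges, has no
  // callee-saved sub- or super-register, and is usable for all required
  // register classes.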
1511   auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1512   for (const MCPhysReg &PR : *RegClass) {
1513     if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1514         !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1515         CanBeUsedForAllClasses(PR)) {
1516       DefinedInBB.addReg(PR);
1517       LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1518                         << "\n");
1519       return {PR};
1520     }
1521   }
1522   LLVM_DEBUG(dbgs() << "No rename register found from "
1523                     << TRI->getRegClassName(RegClass) << "\n");
1524   return None;
1525 }
1526 
1527 /// Scan the instructions looking for a load/store that can be combined with the
1528 /// current instruction into a wider equivalent or a load/store pair.
1529 MachineBasicBlock::iterator
1530 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1531                                       LdStPairFlags &Flags, unsigned Limit,
1532                                       bool FindNarrowMerge) {
1533   MachineBasicBlock::iterator E = I->getParent()->end();
1534   MachineBasicBlock::iterator MBBI = I;
1535   MachineBasicBlock::iterator MBBIWithRenameReg;
1536   MachineInstr &FirstMI = *I;
1537   MBBI = next_nodbg(MBBI, E);
1538 
1539   bool MayLoad = FirstMI.mayLoad();
1540   bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
1541   Register Reg = getLdStRegOp(FirstMI).getReg();
1542   Register BaseReg = getLdStBaseOp(FirstMI).getReg();
1543   int Offset = getLdStOffsetOp(FirstMI).getImm();
1544   int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1545   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1546 
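  // Whether FirstMI's register can be renamed is computed lazily, at most
  // once per call; disabling renaming pretends that the check already failed.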
1547   Optional<bool> MaybeCanRename = None;
1548   if (!EnableRenaming)
1549     MaybeCanRename = {false};
1550 
1551   SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1552   LiveRegUnits UsedInBetween;
1553   UsedInBetween.init(*TRI);
1554 
1555   Flags.clearRenameReg();
1556 
1557   // Track which register units have been modified and used between the first
1558   // insn (inclusive) and the second insn.
1559   ModifiedRegUnits.clear();
1560   UsedRegUnits.clear();
1561 
1562   // Remember any instructions that read/write memory between FirstMI and MI.
1563   SmallVector<MachineInstr *, 4> MemInsns;
1564 
1565   for (unsigned Count = 0; MBBI != E && Count < Limit;
1566        MBBI = next_nodbg(MBBI, E)) {
1567     MachineInstr &MI = *MBBI;
1568 
1569     UsedInBetween.accumulate(MI);
1570 
1571     // Don't count transient instructions towards the search limit since there
1572     // may be different numbers of them if e.g. debug information is present.
1573     if (!MI.isTransient())
1574       ++Count;
1575 
1576     Flags.setSExtIdx(-1);
1577     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1578         getLdStOffsetOp(MI).isImm()) {
1579       assert(MI.mayLoadOrStore() && "Expected memory operation.");
1580       // If we've found another instruction with the same opcode, check to see
1581       // if the base and offset are compatible with our starting instruction.
1582       // These instructions all have scaled immediate operands, so we just
1583       // check for +1/-1. Make sure to check the new instruction offset is
1584       // actually an immediate and not a symbolic reference destined for
1585       // a relocation.
1586       Register MIBaseReg = getLdStBaseOp(MI).getReg();
1587       int MIOffset = getLdStOffsetOp(MI).getImm();
1588       bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1589       if (IsUnscaled != MIIsUnscaled) {
1590         // We're trying to pair instructions that differ in how they are scaled.
1591         // If FirstMI is scaled then scale the offset of MI accordingly.
1592         // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1593         int MemSize = TII->getMemScale(MI);
1594         if (MIIsUnscaled) {
1595           // If the unscaled offset isn't a multiple of the MemSize, we can't
1596           // pair the operations together: bail and keep looking.
1597           if (MIOffset % MemSize) {
1598             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1599                                               UsedRegUnits, TRI);
1600             MemInsns.push_back(&MI);
1601             continue;
1602           }
1603           MIOffset /= MemSize;
1604         } else {
1605           MIOffset *= MemSize;
1606         }
1607       }
1608 
1609       bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1610 
1611       if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register, it can't be paired with a pre-index ld/st
        // pair. Additionally, if the base reg is used or modified, the
        // operations can't be paired: bail and keep looking.
1616         if (IsPreLdSt) {
1617           bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1618           bool IsBaseRegUsed =
1619               !UsedRegUnits.available(getLdStBaseOp(MI).getReg());
1620           bool IsBaseRegModified =
1621               !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction are
          // the same, it needs to use the updated register and therefore the
          // instructions must not be folded.
1625           bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1626                                                  getLdStBaseOp(MI).getReg());
1627           if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1628               IsMIRegTheSame) {
1629             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1630                                               UsedRegUnits, TRI);
1631             MemInsns.push_back(&MI);
1632             continue;
1633           }
1634         } else {
1635           if ((Offset != MIOffset + OffsetStride) &&
1636               (Offset + OffsetStride != MIOffset)) {
1637             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1638                                               UsedRegUnits, TRI);
1639             MemInsns.push_back(&MI);
1640             continue;
1641           }
1642         }
1643 
1644         int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1645         if (FindNarrowMerge) {
1646           // If the alignment requirements of the scaled wide load/store
1647           // instruction can't express the offset of the scaled narrow input,
1648           // bail and keep looking. For promotable zero stores, allow only when
1649           // the stored value is the same (i.e., WZR).
1650           if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1651               (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1652             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1653                                               UsedRegUnits, TRI);
1654             MemInsns.push_back(&MI);
1655             continue;
1656           }
1657         } else {
1658           // Pairwise instructions have a 7-bit signed offset field. Single
1659           // insns have a 12-bit unsigned offset field.  If the resultant
1660           // immediate offset of merging these instructions is out of range for
1661           // a pairwise instruction, bail and keep looking.
1662           if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1663             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1664                                               UsedRegUnits, TRI);
1665             MemInsns.push_back(&MI);
1666             continue;
1667           }
1668           // If the alignment requirements of the paired (scaled) instruction
1669           // can't express the offset of the unscaled input, bail and keep
1670           // looking.
1671           if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1672             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1673                                               UsedRegUnits, TRI);
1674             MemInsns.push_back(&MI);
1675             continue;
1676           }
1677         }
1678         // If the destination register of one load is the same register or a
1679         // sub/super register of the other load, bail and keep looking. A
1680         // load-pair instruction with both destination registers the same is
1681         // UNPREDICTABLE and will result in an exception.
1682         if (MayLoad &&
1683             TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
1684           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1685                                             TRI);
1686           MemInsns.push_back(&MI);
1687           continue;
1688         }
1689 
1690         // If the BaseReg has been modified, then we cannot do the optimization.
1691         // For example, in the following pattern
1692         //   ldr x1 [x2]
1693         //   ldr x2 [x3]
1694         //   ldr x4 [x2, #8],
1695         // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1696         if (!ModifiedRegUnits.available(BaseReg))
1697           return E;
1698 
1699         // If the Rt of the second instruction was not modified or used between
1700         // the two instructions and none of the instructions between the second
1701         // and first alias with the second, we can combine the second into the
1702         // first.
1703         if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1704             !(MI.mayLoad() &&
1705               !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1706             !mayAlias(MI, MemInsns, AA)) {
1707 
1708           Flags.setMergeForward(false);
1709           Flags.clearRenameReg();
1710           return MBBI;
1711         }
1712 
1713         // Likewise, if the Rt of the first instruction is not modified or used
1714         // between the two instructions and none of the instructions between the
1715         // first and the second alias with the first, we can combine the first
1716         // into the second.
1717         if (!(MayLoad &&
1718               !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1719             !mayAlias(FirstMI, MemInsns, AA)) {
1720 
1721           if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1722             Flags.setMergeForward(true);
1723             Flags.clearRenameReg();
1724             return MBBI;
1725           }
1726 
1727           if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1728             if (!MaybeCanRename)
1729               MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
1730                                                  RequiredClasses, TRI)};
1731 
1732             if (*MaybeCanRename) {
1733               Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
1734                   *FirstMI.getParent()->getParent(), Reg, DefinedInBB,
1735                   UsedInBetween, RequiredClasses, TRI);
1736               if (MaybeRenameReg) {
1737                 Flags.setRenameReg(*MaybeRenameReg);
1738                 Flags.setMergeForward(true);
1739                 MBBIWithRenameReg = MBBI;
1740               }
1741             }
1742           }
1743         }
1744         // Unable to combine these instructions due to interference in between.
1745         // Keep looking.
1746       }
1747     }
1748 
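    // If the candidate above could be merged forward by renaming FirstMI's
    // register, return it now.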
1749     if (Flags.getRenameReg())
1750       return MBBIWithRenameReg;
1751 
    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
1754     if (MI.isCall())
1755       return E;
1756 
1757     // Update modified / uses register units.
1758     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1759 
1760     // Otherwise, if the base register is modified, we have no match, so
1761     // return early.
1762     if (!ModifiedRegUnits.available(BaseReg))
1763       return E;
1764 
1765     // Update list of instructions that read/write memory.
1766     if (MI.mayLoadOrStore())
1767       MemInsns.push_back(&MI);
1768   }
1769   return E;
1770 }
1771 
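// If \p MI is an SP-based load/store marked as frame setup/destroy and
// \p MaybeCFI is a CFI instruction that defines the CFA or the CFA offset,
// return \p MaybeCFI so the caller can move it past the merged instruction;
// otherwise return the end iterator.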
1772 static MachineBasicBlock::iterator
1773 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1774   auto End = MI.getParent()->end();
1775   if (MaybeCFI == End ||
1776       MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1777       !(MI.getFlag(MachineInstr::FrameSetup) ||
1778         MI.getFlag(MachineInstr::FrameDestroy)) ||
1779       getLdStBaseOp(MI).getReg() != AArch64::SP)
1780     return End;
1781 
1782   const MachineFunction &MF = *MI.getParent()->getParent();
1783   unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1784   const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1785   switch (CFI.getOperation()) {
1786   case MCCFIInstruction::OpDefCfa:
1787   case MCCFIInstruction::OpDefCfaOffset:
1788     return MaybeCFI;
1789   default:
1790     return End;
1791   }
1792 }
1793 
1794 MachineBasicBlock::iterator
1795 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1796                                      MachineBasicBlock::iterator Update,
1797                                      bool IsPreIdx) {
1798   assert((Update->getOpcode() == AArch64::ADDXri ||
1799           Update->getOpcode() == AArch64::SUBXri) &&
1800          "Unexpected base register update instruction to merge!");
1801   MachineBasicBlock::iterator E = I->getParent()->end();
1802   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1803 
  // If we are updating SP and the following instruction is a CFA offset
  // related CFI instruction, move it after the merged instruction.
1806   MachineBasicBlock::iterator CFI =
1807       IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
1808 
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load, unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
1812   if (NextI == Update)
1813     NextI = next_nodbg(NextI, E);
1814 
1815   int Value = Update->getOperand(2).getImm();
1816   assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1817          "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1818   if (Update->getOpcode() == AArch64::SUBXri)
1819     Value = -Value;
1820 
1821   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1822                              : getPostIndexedOpcode(I->getOpcode());
1823   MachineInstrBuilder MIB;
1824   int Scale, MinOffset, MaxOffset;
1825   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
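  // Build the pre/post-indexed instruction: the written-back base register
  // (taken from the update instruction) comes first, followed by the transfer
  // register(s), the base register and the scaled immediate offset.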
1826   if (!isPairedLdSt(*I)) {
1827     // Non-paired instruction.
1828     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1829               .add(getLdStRegOp(*Update))
1830               .add(getLdStRegOp(*I))
1831               .add(getLdStBaseOp(*I))
1832               .addImm(Value / Scale)
1833               .setMemRefs(I->memoperands())
1834               .setMIFlags(I->mergeFlagsWith(*Update));
1835   } else {
1836     // Paired instruction.
1837     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1838               .add(getLdStRegOp(*Update))
1839               .add(getLdStRegOp(*I, 0))
1840               .add(getLdStRegOp(*I, 1))
1841               .add(getLdStBaseOp(*I))
1842               .addImm(Value / Scale)
1843               .setMemRefs(I->memoperands())
1844               .setMIFlags(I->mergeFlagsWith(*Update));
1845   }
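  // If the update modified SP and was followed by a CFA-related CFI
  // instruction, move that CFI instruction after the newly created
  // pre-indexed instruction to keep the unwind information consistent.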
1846   if (CFI != E) {
1847     MachineBasicBlock *MBB = I->getParent();
1848     MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
1849   }
1850 
1851   if (IsPreIdx) {
1852     ++NumPreFolded;
1853     LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1854   } else {
1855     ++NumPostFolded;
1856     LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1857   }
1858   LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
1859   LLVM_DEBUG(I->print(dbgs()));
1860   LLVM_DEBUG(dbgs() << "    ");
1861   LLVM_DEBUG(Update->print(dbgs()));
1862   LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
1863   LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1864   LLVM_DEBUG(dbgs() << "\n");
1865 
1866   // Erase the old instructions for the block.
1867   I->eraseFromParent();
1868   Update->eraseFromParent();
1869 
1870   return NextI;
1871 }
1872 
1873 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1874                                                MachineInstr &MI,
1875                                                unsigned BaseReg, int Offset) {
1876   switch (MI.getOpcode()) {
1877   default:
1878     break;
1879   case AArch64::SUBXri:
1880   case AArch64::ADDXri:
1881     // Make sure it's a vanilla immediate operand, not a relocation or
1882     // anything else we can't handle.
1883     if (!MI.getOperand(2).isImm())
1884       break;
1885     // Watch out for 1 << 12 shifted value.
1886     if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1887       break;
1888 
1889     // The update instruction source and destination register must be the
1890     // same as the load/store base register.
1891     if (MI.getOperand(0).getReg() != BaseReg ||
1892         MI.getOperand(1).getReg() != BaseReg)
1893       break;
1894 
1895     int UpdateOffset = MI.getOperand(2).getImm();
1896     if (MI.getOpcode() == AArch64::SUBXri)
1897       UpdateOffset = -UpdateOffset;
1898 
1899     // The immediate must be a multiple of the scaling factor of the pre/post
1900     // indexed instruction.
1901     int Scale, MinOffset, MaxOffset;
1902     getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
1903     if (UpdateOffset % Scale != 0)
1904       break;
1905 
1906     // Scaled offset must fit in the instruction immediate.
1907     int ScaledOffset = UpdateOffset / Scale;
1908     if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
1909       break;
1910 
1911     // If we have a non-zero Offset, we check that it matches the amount
1912     // we're adding to the register.
1913     if (!Offset || Offset == UpdateOffset)
1914       return true;
1915     break;
1916   }
1917   return false;
1918 }
1919 
1920 static bool needsWinCFI(const MachineFunction *MF) {
1921   return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1922          MF->getFunction().needsUnwindTableEntry();
1923 }
1924 
1925 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1926     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1927   MachineBasicBlock::iterator E = I->getParent()->end();
1928   MachineInstr &MemMI = *I;
1929   MachineBasicBlock::iterator MBBI = I;
1930 
1931   Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset =
      getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
1933 
1934   // Scan forward looking for post-index opportunities.  Updating instructions
1935   // can't be formed if the memory instruction doesn't have the offset we're
1936   // looking for.
1937   if (MIUnscaledOffset != UnscaledOffset)
1938     return E;
1939 
1940   // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // It also does not apply to STGPi, which, unlike normal stores, does not
  // have unpredictable behavior in this case and always performs writeback
  // after reading the source register value.
1946   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
1947     bool IsPairedInsn = isPairedLdSt(MemMI);
1948     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1949       Register DestReg = getLdStRegOp(MemMI, i).getReg();
1950       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1951         return E;
1952     }
1953   }
1954 
1955   // Track which register units have been modified and used between the first
1956   // insn (inclusive) and the second insn.
1957   ModifiedRegUnits.clear();
1958   UsedRegUnits.clear();
1959   MBBI = next_nodbg(MBBI, E);
1960 
1961   // We can't post-increment the stack pointer if any instruction between
1962   // the memory access (I) and the increment (MBBI) can access the memory
1963   // region defined by [SP, MBBI].
1964   const bool BaseRegSP = BaseReg == AArch64::SP;
1965   if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP on Windows
    // targets, as it requires adjusting the unwind/debug info; getting the
    // unwind info wrong can actually cause a miscompile.
1969     return E;
1970   }
1971 
1972   for (unsigned Count = 0; MBBI != E && Count < Limit;
1973        MBBI = next_nodbg(MBBI, E)) {
1974     MachineInstr &MI = *MBBI;
1975 
1976     // Don't count transient instructions towards the search limit since there
1977     // may be different numbers of them if e.g. debug information is present.
1978     if (!MI.isTransient())
1979       ++Count;
1980 
1981     // If we found a match, return it.
1982     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1983       return MBBI;
1984 
1985     // Update the status of what the instruction clobbered and used.
1986     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1987 
1988     // Otherwise, if the base register is used or modified, we have no match, so
1989     // return early.
1990     // If we are optimizing SP, do not allow instructions that may load or store
1991     // in between the load and the optimized value update.
1992     if (!ModifiedRegUnits.available(BaseReg) ||
1993         !UsedRegUnits.available(BaseReg) ||
1994         (BaseRegSP && MBBI->mayLoadOrStore()))
1995       return E;
1996   }
1997   return E;
1998 }
1999 
2000 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2001     MachineBasicBlock::iterator I, unsigned Limit) {
2002   MachineBasicBlock::iterator B = I->getParent()->begin();
2003   MachineBasicBlock::iterator E = I->getParent()->end();
2004   MachineInstr &MemMI = *I;
2005   MachineBasicBlock::iterator MBBI = I;
2006   MachineFunction &MF = *MemMI.getMF();
2007 
2008   Register BaseReg = getLdStBaseOp(MemMI).getReg();
2009   int Offset = getLdStOffsetOp(MemMI).getImm();
2010 
2011   // If the load/store is the first instruction in the block, there's obviously
2012   // not any matching update. Ditto if the memory offset isn't zero.
2013   if (MBBI == B || Offset != 0)
2014     return E;
2015   // If the base register overlaps a destination register, we can't
2016   // merge the update.
2017   if (!isTagStore(MemMI)) {
2018     bool IsPairedInsn = isPairedLdSt(MemMI);
2019     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2020       Register DestReg = getLdStRegOp(MemMI, i).getReg();
2021       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2022         return E;
2023     }
2024   }
2025 
2026   const bool BaseRegSP = BaseReg == AArch64::SP;
2027   if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP on Windows
    // targets, as it requires adjusting the unwind/debug info; getting the
    // unwind info wrong can actually cause a miscompile.
2031     return E;
2032   }
2033 
2034   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2035   unsigned RedZoneSize =
2036       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2037 
2038   // Track which register units have been modified and used between the first
2039   // insn (inclusive) and the second insn.
2040   ModifiedRegUnits.clear();
2041   UsedRegUnits.clear();
2042   unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
2044   do {
2045     MBBI = prev_nodbg(MBBI, B);
2046     MachineInstr &MI = *MBBI;
2047 
2048     // Don't count transient instructions towards the search limit since there
2049     // may be different numbers of them if e.g. debug information is present.
2050     if (!MI.isTransient())
2051       ++Count;
2052 
2053     // If we found a match, return it.
2054     if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2055       // Check that the update value is within our red zone limit (which may be
2056       // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2058         return E;
2059       return MBBI;
2060     }
2061 
2062     // Update the status of what the instruction clobbered and used.
2063     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2064 
2065     // Otherwise, if the base register is used or modified, we have no match, so
2066     // return early.
2067     if (!ModifiedRegUnits.available(BaseReg) ||
2068         !UsedRegUnits.available(BaseReg))
2069       return E;
    // Keep track of whether we have a memory access before an SP
    // pre-increment; in that case we need to validate later that the update
    // amount respects the red zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
2075   } while (MBBI != B && Count < Limit);
2076   return E;
2077 }
2078 
2079 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2080     MachineBasicBlock::iterator &MBBI) {
2081   MachineInstr &MI = *MBBI;
2082   // If this is a volatile load, don't mess with it.
2083   if (MI.hasOrderedMemoryRef())
2084     return false;
2085 
2086   // Make sure this is a reg+imm.
2087   // FIXME: It is possible to extend it to handle reg+reg cases.
2088   if (!getLdStOffsetOp(MI).isImm())
2089     return false;
2090 
2091   // Look backward up to LdStLimit instructions.
2092   MachineBasicBlock::iterator StoreI;
2093   if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2094     ++NumLoadsFromStoresPromoted;
2095     // Promote the load. Keeping the iterator straight is a
2096     // pain, so we let the merge routine tell us what the next instruction
2097     // is after it's done mucking about.
2098     MBBI = promoteLoadFromStore(MBBI, StoreI);
2099     return true;
2100   }
2101   return false;
2102 }
2103 
2104 // Merge adjacent zero stores into a wider store.
2105 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2106     MachineBasicBlock::iterator &MBBI) {
2107   assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2108   MachineInstr &MI = *MBBI;
2109   MachineBasicBlock::iterator E = MI.getParent()->end();
2110 
2111   if (!TII->isCandidateToMergeOrPair(MI))
2112     return false;
2113 
2114   // Look ahead up to LdStLimit instructions for a mergable instruction.
2115   LdStPairFlags Flags;
2116   MachineBasicBlock::iterator MergeMI =
2117       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2118   if (MergeMI != E) {
2119     ++NumZeroStoresPromoted;
2120 
2121     // Keeping the iterator straight is a pain, so we let the merge routine tell
2122     // us what the next instruction is after it's done mucking about.
2123     MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2124     return true;
2125   }
2126   return false;
2127 }
2128 
2129 // Find loads and stores that can be merged into a single load or store pair
2130 // instruction.
2131 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2132   MachineInstr &MI = *MBBI;
2133   MachineBasicBlock::iterator E = MI.getParent()->end();
2134 
2135   if (!TII->isCandidateToMergeOrPair(MI))
2136     return false;
2137 
2138   // Early exit if the offset is not possible to match. (6 bits of positive
2139   // range, plus allow an extra one in case we find a later insn that matches
2140   // with Offset-1)
2141   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2142   int Offset = getLdStOffsetOp(MI).getImm();
2143   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2144   // Allow one more for offset.
2145   if (Offset > 0)
2146     Offset -= OffsetStride;
2147   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2148     return false;
2149 
2150   // Look ahead up to LdStLimit instructions for a pairable instruction.
2151   LdStPairFlags Flags;
2152   MachineBasicBlock::iterator Paired =
2153       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2154   if (Paired != E) {
2155     ++NumPairCreated;
2156     if (TII->hasUnscaledLdStOffset(MI))
2157       ++NumUnscaledPairCreated;
2158     // Keeping the iterator straight is a pain, so we let the merge routine tell
2159     // us what the next instruction is after it's done mucking about.
2160     auto Prev = std::prev(MBBI);
2161     MBBI = mergePairedInsns(MBBI, Paired, Flags);
2162     // Collect liveness info for instructions between Prev and the new position
2163     // MBBI.
2164     for (auto I = std::next(Prev); I != MBBI; I++)
2165       updateDefinedRegisters(*I, DefinedInBB, TRI);
2166 
2167     return true;
2168   }
2169   return false;
2170 }
2171 
bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
2174   MachineInstr &MI = *MBBI;
2175   MachineBasicBlock::iterator E = MI.getParent()->end();
2176   MachineBasicBlock::iterator Update;
2177 
2178   // Look forward to try to form a post-index instruction. For example,
2179   // ldr x0, [x20]
2180   // add x20, x20, #32
2181   //   merged into:
2182   // ldr x0, [x20], #32
2183   Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2184   if (Update != E) {
2185     // Merge the update into the ld/st.
2186     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
2187     return true;
2188   }
2189 
2190   // Don't know how to handle unscaled pre/post-index versions below, so bail.
2191   if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2192     return false;
2193 
2194   // Look back to try to find a pre-index instruction. For example,
2195   // add x0, x0, #8
2196   // ldr x1, [x0]
2197   //   merged into:
2198   // ldr x1, [x0, #8]!
2199   Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2200   if (Update != E) {
2201     // Merge the update into the ld/st.
2202     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2203     return true;
2204   }
2205 
2206   // The immediate in the load/store is scaled by the size of the memory
2207   // operation. The immediate in the add we're looking for,
2208   // however, is not, so adjust here.
2209   int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2210 
2211   // Look forward to try to find a pre-index instruction. For example,
2212   // ldr x1, [x0, #64]
2213   // add x0, x0, #64
2214   //   merged into:
2215   // ldr x1, [x0, #64]!
2216   Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2217   if (Update != E) {
2218     // Merge the update into the ld/st.
2219     MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2220     return true;
2221   }
2222 
2223   return false;
2224 }
2225 
2226 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2227                                         bool EnableNarrowZeroStOpt) {
2228 
2229   bool Modified = false;
  // Four transformations to do here:
2231   // 1) Find loads that directly read from stores and promote them by
2232   //    replacing with mov instructions. If the store is wider than the load,
2233   //    the load will be replaced with a bitfield extract.
2234   //      e.g.,
2235   //        str w1, [x0, #4]
2236   //        ldrh w2, [x0, #6]
2237   //        ; becomes
2238   //        str w1, [x0, #4]
2239   //        lsr w2, w1, #16
2240   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2241        MBBI != E;) {
2242     if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2243       Modified = true;
2244     else
2245       ++MBBI;
2246   }
2247   // 2) Merge adjacent zero stores into a wider store.
2248   //      e.g.,
2249   //        strh wzr, [x0]
2250   //        strh wzr, [x0, #2]
2251   //        ; becomes
2252   //        str wzr, [x0]
2253   //      e.g.,
2254   //        str wzr, [x0]
2255   //        str wzr, [x0, #4]
2256   //        ; becomes
2257   //        str xzr, [x0]
2258   if (EnableNarrowZeroStOpt)
2259     for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2260          MBBI != E;) {
2261       if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2262         Modified = true;
2263       else
2264         ++MBBI;
2265     }
2266   // 3) Find loads and stores that can be merged into a single load or store
2267   //    pair instruction.
2268   //      e.g.,
2269   //        ldr x0, [x2]
2270   //        ldr x1, [x2, #8]
2271   //        ; becomes
2272   //        ldp x0, x1, [x2]
2273 
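  // Seed the set of registers defined so far in this block with the block
  // live-ins; it is updated while scanning below and consulted when searching
  // for a rename register.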
2274   if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2275     DefinedInBB.clear();
2276     DefinedInBB.addLiveIns(MBB);
2277   }
2278 
2279   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2280        MBBI != E;) {
2281     // Track currently live registers up to this point, to help with
2282     // searching for a rename register on demand.
2283     updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2284     if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2285       Modified = true;
2286     else
2287       ++MBBI;
2288   }
2289   // 4) Find base register updates that can be merged into the load or store
2290   //    as a base-reg writeback.
2291   //      e.g.,
2292   //        ldr x0, [x2]
2293   //        add x2, x2, #4
2294   //        ; becomes
2295   //        ldr x0, [x2], #4
2296   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2297        MBBI != E;) {
2298     if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2299       Modified = true;
2300     else
2301       ++MBBI;
2302   }
2303 
2304   return Modified;
2305 }
2306 
2307 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2308   if (skipFunction(Fn.getFunction()))
2309     return false;
2310 
2311   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
2312   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2313   TRI = Subtarget->getRegisterInfo();
2314   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2315 
2316   // Resize the modified and used register unit trackers.  We do this once
2317   // per function and then clear the register units each time we optimize a load
2318   // or store.
2319   ModifiedRegUnits.init(*TRI);
2320   UsedRegUnits.init(*TRI);
2321   DefinedInBB.init(*TRI);
2322 
2323   bool Modified = false;
  bool EnableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt);
2327     Modified |= M;
2328   }
2329 
2330   return Modified;
2331 }
2332 
2333 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2334 // stores near one another?  Note: The pre-RA instruction scheduler already has
2335 // hooks to try and schedule pairable loads/stores together to improve pairing
2336 // opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
2337 
2338 // FIXME: When pairing store instructions it's very possible for this pass to
2339 // hoist a store with a KILL marker above another use (without a KILL marker).
2340 // The resulting IR is invalid, but nothing uses the KILL markers after this
2341 // pass, so it's never caused a problem in practice.
2342 
2343 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2344 /// load / store optimization pass.
2345 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2346   return new AArch64LoadStoreOpt();
2347 }
2348