1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
11 //
12 // The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions when one of them
// has the FrameSetup/FrameDestroy flag or, alternatively, it should apply an
// ad-hoc fix to the unwind information.
17 //
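// As an illustrative sketch (register names are arbitrary), the rewrites
// performed here include pairing adjacent accesses, e.g.
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
// into
//   ldp x0, x1, [x2]
// and folding base register updates into pre-/post-indexed forms, e.g.
//   ldr x0, [x2]
//   add x2, x2, #8
// into
//   ldr x0, [x2], #8
//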
18 //===----------------------------------------------------------------------===//
19
20 #include "AArch64InstrInfo.h"
21 #include "AArch64MachineFunctionInfo.h"
22 #include "AArch64Subtarget.h"
23 #include "MCTargetDesc/AArch64AddressingModes.h"
24 #include "llvm/ADT/BitVector.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/ADT/iterator_range.h"
29 #include "llvm/Analysis/AliasAnalysis.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineFunctionPass.h"
33 #include "llvm/CodeGen/MachineInstr.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/IR/DebugLoc.h"
39 #include "llvm/MC/MCAsmInfo.h"
40 #include "llvm/MC/MCDwarf.h"
41 #include "llvm/MC/MCRegisterInfo.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/DebugCounter.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <functional>
51 #include <iterator>
52 #include <limits>
53
54 using namespace llvm;
55
56 #define DEBUG_TYPE "aarch64-ldst-opt"
57
58 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
59 STATISTIC(NumPostFolded, "Number of post-index updates folded");
60 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
61 STATISTIC(NumUnscaledPairCreated,
"Number of load/store pair instructions generated from unscaled ops");
63 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
64 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65
66 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
67 "Controls which pairs are considered for renaming");
68
69 // The LdStLimit limits how far we search for load/store pairs.
70 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
71 cl::init(20), cl::Hidden);
72
73 // The UpdateLimit limits how far we search for update instructions when we form
74 // pre-/post-index instructions.
75 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
76 cl::Hidden);
77
78 // Enable register renaming to find additional store pairing opportunities.
79 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
80 cl::init(true), cl::Hidden);
81
82 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
83
84 namespace {
85
86 using LdStPairFlags = struct LdStPairFlags {
87 // If a matching instruction is found, MergeForward is set to true if the
88 // merge is to remove the first instruction and replace the second with
89 // a pair-wise insn, and false if the reverse is true.
90 bool MergeForward = false;
91
92 // SExtIdx gives the index of the result of the load pair that must be
93 // extended. The value of SExtIdx assumes that the paired load produces the
94 // value in this order: (I, returned iterator), i.e., -1 means no value has
95 // to be extended, 0 means I, and 1 means the returned iterator.
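// For example (illustrative registers), pairing
//   ldrsw x0, [x2]
//   ldr w1, [x2, #4]
// yields roughly
//   ldp w0, w1, [x2]
//   sxtw x0, w0
// with SExtIdx recording which of the two results needs the sign extension.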
96 int SExtIdx = -1;
97
98 // If not none, RenameReg can be used to rename the result register of the
99 // first store in a pair. Currently this only works when merging stores
100 // forward.
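// A rough sketch (illustrative registers): in
//   str w8, [x0]
//   mov w8, w9        // w8 is redefined between the two stores
//   str w10, [x0, #4]
// the first store cannot simply be moved down to the second one, but renaming
// w8 (from its def up to the first store) to an otherwise unused register
// allows the forward merge into an stp.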
101 Optional<MCPhysReg> RenameReg = None;
102
103 LdStPairFlags() = default;
104
105 void setMergeForward(bool V = true) { MergeForward = V; }
106 bool getMergeForward() const { return MergeForward; }
107
108 void setSExtIdx(int V) { SExtIdx = V; }
109 int getSExtIdx() const { return SExtIdx; }
110
111 void setRenameReg(MCPhysReg R) { RenameReg = R; }
112 void clearRenameReg() { RenameReg = None; }
113 Optional<MCPhysReg> getRenameReg() const { return RenameReg; }
114 };
115
116 struct AArch64LoadStoreOpt : public MachineFunctionPass {
117 static char ID;
118
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
120 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
121 }
122
123 AliasAnalysis *AA;
124 const AArch64InstrInfo *TII;
125 const TargetRegisterInfo *TRI;
126 const AArch64Subtarget *Subtarget;
127
128 // Track which register units have been modified and used.
129 LiveRegUnits ModifiedRegUnits, UsedRegUnits;
130 LiveRegUnits DefinedInBB;
131
void getAnalysisUsage(AnalysisUsage &AU) const override {
133 AU.addRequired<AAResultsWrapperPass>();
134 MachineFunctionPass::getAnalysisUsage(AU);
135 }
136
137 // Scan the instructions looking for a load/store that can be combined
138 // with the current instruction into a load/store pair.
139 // Return the matching instruction if one is found, else MBB->end().
140 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
141 LdStPairFlags &Flags,
142 unsigned Limit,
143 bool FindNarrowMerge);
144
145 // Scan the instructions looking for a store that writes to the address from
146 // which the current load instruction reads. Return true if one is found.
147 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
148 MachineBasicBlock::iterator &StoreI);
149
// Merge the two narrow zero-store instructions indicated into a single wider store.
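// For example (illustrative),
//   strh wzr, [x0]
//   strh wzr, [x0, #2]
// becomes
//   str wzr, [x0]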
151 MachineBasicBlock::iterator
152 mergeNarrowZeroStores(MachineBasicBlock::iterator I,
153 MachineBasicBlock::iterator MergeMI,
154 const LdStPairFlags &Flags);
155
156 // Merge the two instructions indicated into a single pair-wise instruction.
157 MachineBasicBlock::iterator
158 mergePairedInsns(MachineBasicBlock::iterator I,
159 MachineBasicBlock::iterator Paired,
160 const LdStPairFlags &Flags);
161
162 // Promote the load that reads directly from the address stored to.
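// For example (illustrative),
//   str w1, [x0, #4]
//   ldr w2, [x0, #4]
// is rewritten so that the load becomes a register move:
//   str w1, [x0, #4]
//   mov w2, w1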
163 MachineBasicBlock::iterator
164 promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
165 MachineBasicBlock::iterator StoreI);
166
167 // Scan the instruction list to find a base register update that can
168 // be combined with the current instruction (a load or store) using
169 // pre or post indexed addressing with writeback. Scan forwards.
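// For example (illustrative),
//   str x0, [x2]
//   add x2, x2, #8
// is folded into the post-indexed form
//   str x0, [x2], #8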
170 MachineBasicBlock::iterator
171 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
172 int UnscaledOffset, unsigned Limit);
173
174 // Scan the instruction list to find a base register update that can
175 // be combined with the current instruction (a load or store) using
176 // pre or post indexed addressing with writeback. Scan backwards.
177 MachineBasicBlock::iterator
178 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
179
180 // Find an instruction that updates the base register of the ld/st
181 // instruction.
182 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
183 unsigned BaseReg, int Offset);
184
185 // Merge a pre- or post-index base register update into a ld/st instruction.
186 MachineBasicBlock::iterator
187 mergeUpdateInsn(MachineBasicBlock::iterator I,
188 MachineBasicBlock::iterator Update, bool IsPreIdx);
189
190 // Find and merge zero store instructions.
191 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
192
193 // Find and pair ldr/str instructions.
194 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
195
196 // Find and promote load instructions which read directly from store.
197 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
198
// Find and merge base register updates before or after a ld/st instruction.
200 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
201
202 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
203
204 bool runOnMachineFunction(MachineFunction &Fn) override;
205
MachineFunctionProperties getRequiredProperties() const override {
207 return MachineFunctionProperties().set(
208 MachineFunctionProperties::Property::NoVRegs);
209 }
210
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
212 };
213
214 char AArch64LoadStoreOpt::ID = 0;
215
216 } // end anonymous namespace
217
218 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
219 AARCH64_LOAD_STORE_OPT_NAME, false, false)
220
static bool isNarrowStore(unsigned Opc) {
222 switch (Opc) {
223 default:
224 return false;
225 case AArch64::STRBBui:
226 case AArch64::STURBBi:
227 case AArch64::STRHHui:
228 case AArch64::STURHHi:
229 return true;
230 }
231 }
232
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
236 switch (MI.getOpcode()) {
237 default:
238 return false;
239 case AArch64::STGOffset:
240 case AArch64::STZGOffset:
241 case AArch64::ST2GOffset:
242 case AArch64::STZ2GOffset:
243 return true;
244 }
245 }
246
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
248 bool *IsValidLdStrOpc = nullptr) {
249 if (IsValidLdStrOpc)
250 *IsValidLdStrOpc = true;
251 switch (Opc) {
252 default:
253 if (IsValidLdStrOpc)
254 *IsValidLdStrOpc = false;
255 return std::numeric_limits<unsigned>::max();
256 case AArch64::STRDui:
257 case AArch64::STURDi:
258 case AArch64::STRDpre:
259 case AArch64::STRQui:
260 case AArch64::STURQi:
261 case AArch64::STRQpre:
262 case AArch64::STRBBui:
263 case AArch64::STURBBi:
264 case AArch64::STRHHui:
265 case AArch64::STURHHi:
266 case AArch64::STRWui:
267 case AArch64::STRWpre:
268 case AArch64::STURWi:
269 case AArch64::STRXui:
270 case AArch64::STRXpre:
271 case AArch64::STURXi:
272 case AArch64::LDRDui:
273 case AArch64::LDURDi:
274 case AArch64::LDRDpre:
275 case AArch64::LDRQui:
276 case AArch64::LDURQi:
277 case AArch64::LDRQpre:
278 case AArch64::LDRWui:
279 case AArch64::LDURWi:
280 case AArch64::LDRWpre:
281 case AArch64::LDRXui:
282 case AArch64::LDURXi:
283 case AArch64::LDRXpre:
284 case AArch64::STRSui:
285 case AArch64::STURSi:
286 case AArch64::STRSpre:
287 case AArch64::LDRSui:
288 case AArch64::LDURSi:
289 case AArch64::LDRSpre:
290 return Opc;
291 case AArch64::LDRSWui:
292 return AArch64::LDRWui;
293 case AArch64::LDURSWi:
294 return AArch64::LDURWi;
295 }
296 }
297
static unsigned getMatchingWideOpcode(unsigned Opc) {
299 switch (Opc) {
300 default:
301 llvm_unreachable("Opcode has no wide equivalent!");
302 case AArch64::STRBBui:
303 return AArch64::STRHHui;
304 case AArch64::STRHHui:
305 return AArch64::STRWui;
306 case AArch64::STURBBi:
307 return AArch64::STURHHi;
308 case AArch64::STURHHi:
309 return AArch64::STURWi;
310 case AArch64::STURWi:
311 return AArch64::STURXi;
312 case AArch64::STRWui:
313 return AArch64::STRXui;
314 }
315 }
316
static unsigned getMatchingPairOpcode(unsigned Opc) {
318 switch (Opc) {
319 default:
320 llvm_unreachable("Opcode has no pairwise equivalent!");
321 case AArch64::STRSui:
322 case AArch64::STURSi:
323 return AArch64::STPSi;
324 case AArch64::STRSpre:
325 return AArch64::STPSpre;
326 case AArch64::STRDui:
327 case AArch64::STURDi:
328 return AArch64::STPDi;
329 case AArch64::STRDpre:
330 return AArch64::STPDpre;
331 case AArch64::STRQui:
332 case AArch64::STURQi:
333 return AArch64::STPQi;
334 case AArch64::STRQpre:
335 return AArch64::STPQpre;
336 case AArch64::STRWui:
337 case AArch64::STURWi:
338 return AArch64::STPWi;
339 case AArch64::STRWpre:
340 return AArch64::STPWpre;
341 case AArch64::STRXui:
342 case AArch64::STURXi:
343 return AArch64::STPXi;
344 case AArch64::STRXpre:
345 return AArch64::STPXpre;
346 case AArch64::LDRSui:
347 case AArch64::LDURSi:
348 return AArch64::LDPSi;
349 case AArch64::LDRSpre:
350 return AArch64::LDPSpre;
351 case AArch64::LDRDui:
352 case AArch64::LDURDi:
353 return AArch64::LDPDi;
354 case AArch64::LDRDpre:
355 return AArch64::LDPDpre;
356 case AArch64::LDRQui:
357 case AArch64::LDURQi:
358 return AArch64::LDPQi;
359 case AArch64::LDRQpre:
360 return AArch64::LDPQpre;
361 case AArch64::LDRWui:
362 case AArch64::LDURWi:
363 return AArch64::LDPWi;
364 case AArch64::LDRWpre:
365 return AArch64::LDPWpre;
366 case AArch64::LDRXui:
367 case AArch64::LDURXi:
368 return AArch64::LDPXi;
369 case AArch64::LDRXpre:
370 return AArch64::LDPXpre;
371 case AArch64::LDRSWui:
372 case AArch64::LDURSWi:
373 return AArch64::LDPSWi;
374 }
375 }
376
static unsigned isMatchingStore(MachineInstr &LoadInst,
378 MachineInstr &StoreInst) {
379 unsigned LdOpc = LoadInst.getOpcode();
380 unsigned StOpc = StoreInst.getOpcode();
381 switch (LdOpc) {
382 default:
383 llvm_unreachable("Unsupported load instruction!");
384 case AArch64::LDRBBui:
385 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
386 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
387 case AArch64::LDURBBi:
388 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
389 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
390 case AArch64::LDRHHui:
391 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
392 StOpc == AArch64::STRXui;
393 case AArch64::LDURHHi:
394 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
395 StOpc == AArch64::STURXi;
396 case AArch64::LDRWui:
397 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
398 case AArch64::LDURWi:
399 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
400 case AArch64::LDRXui:
401 return StOpc == AArch64::STRXui;
402 case AArch64::LDURXi:
403 return StOpc == AArch64::STURXi;
404 }
405 }
406
static unsigned getPreIndexedOpcode(unsigned Opc) {
408 // FIXME: We don't currently support creating pre-indexed loads/stores when
409 // the load or store is the unscaled version. If we decide to perform such an
410 // optimization in the future the cases for the unscaled loads/stores will
411 // need to be added here.
412 switch (Opc) {
413 default:
414 llvm_unreachable("Opcode has no pre-indexed equivalent!");
415 case AArch64::STRSui:
416 return AArch64::STRSpre;
417 case AArch64::STRDui:
418 return AArch64::STRDpre;
419 case AArch64::STRQui:
420 return AArch64::STRQpre;
421 case AArch64::STRBBui:
422 return AArch64::STRBBpre;
423 case AArch64::STRHHui:
424 return AArch64::STRHHpre;
425 case AArch64::STRWui:
426 return AArch64::STRWpre;
427 case AArch64::STRXui:
428 return AArch64::STRXpre;
429 case AArch64::LDRSui:
430 return AArch64::LDRSpre;
431 case AArch64::LDRDui:
432 return AArch64::LDRDpre;
433 case AArch64::LDRQui:
434 return AArch64::LDRQpre;
435 case AArch64::LDRBBui:
436 return AArch64::LDRBBpre;
437 case AArch64::LDRHHui:
438 return AArch64::LDRHHpre;
439 case AArch64::LDRWui:
440 return AArch64::LDRWpre;
441 case AArch64::LDRXui:
442 return AArch64::LDRXpre;
443 case AArch64::LDRSWui:
444 return AArch64::LDRSWpre;
445 case AArch64::LDPSi:
446 return AArch64::LDPSpre;
447 case AArch64::LDPSWi:
448 return AArch64::LDPSWpre;
449 case AArch64::LDPDi:
450 return AArch64::LDPDpre;
451 case AArch64::LDPQi:
452 return AArch64::LDPQpre;
453 case AArch64::LDPWi:
454 return AArch64::LDPWpre;
455 case AArch64::LDPXi:
456 return AArch64::LDPXpre;
457 case AArch64::STPSi:
458 return AArch64::STPSpre;
459 case AArch64::STPDi:
460 return AArch64::STPDpre;
461 case AArch64::STPQi:
462 return AArch64::STPQpre;
463 case AArch64::STPWi:
464 return AArch64::STPWpre;
465 case AArch64::STPXi:
466 return AArch64::STPXpre;
467 case AArch64::STGOffset:
468 return AArch64::STGPreIndex;
469 case AArch64::STZGOffset:
470 return AArch64::STZGPreIndex;
471 case AArch64::ST2GOffset:
472 return AArch64::ST2GPreIndex;
473 case AArch64::STZ2GOffset:
474 return AArch64::STZ2GPreIndex;
475 case AArch64::STGPi:
476 return AArch64::STGPpre;
477 }
478 }
479
static unsigned getPostIndexedOpcode(unsigned Opc) {
481 switch (Opc) {
482 default:
llvm_unreachable("Opcode has no post-indexed equivalent!");
484 case AArch64::STRSui:
485 case AArch64::STURSi:
486 return AArch64::STRSpost;
487 case AArch64::STRDui:
488 case AArch64::STURDi:
489 return AArch64::STRDpost;
490 case AArch64::STRQui:
491 case AArch64::STURQi:
492 return AArch64::STRQpost;
493 case AArch64::STRBBui:
494 return AArch64::STRBBpost;
495 case AArch64::STRHHui:
496 return AArch64::STRHHpost;
497 case AArch64::STRWui:
498 case AArch64::STURWi:
499 return AArch64::STRWpost;
500 case AArch64::STRXui:
501 case AArch64::STURXi:
502 return AArch64::STRXpost;
503 case AArch64::LDRSui:
504 case AArch64::LDURSi:
505 return AArch64::LDRSpost;
506 case AArch64::LDRDui:
507 case AArch64::LDURDi:
508 return AArch64::LDRDpost;
509 case AArch64::LDRQui:
510 case AArch64::LDURQi:
511 return AArch64::LDRQpost;
512 case AArch64::LDRBBui:
513 return AArch64::LDRBBpost;
514 case AArch64::LDRHHui:
515 return AArch64::LDRHHpost;
516 case AArch64::LDRWui:
517 case AArch64::LDURWi:
518 return AArch64::LDRWpost;
519 case AArch64::LDRXui:
520 case AArch64::LDURXi:
521 return AArch64::LDRXpost;
522 case AArch64::LDRSWui:
523 return AArch64::LDRSWpost;
524 case AArch64::LDPSi:
525 return AArch64::LDPSpost;
526 case AArch64::LDPSWi:
527 return AArch64::LDPSWpost;
528 case AArch64::LDPDi:
529 return AArch64::LDPDpost;
530 case AArch64::LDPQi:
531 return AArch64::LDPQpost;
532 case AArch64::LDPWi:
533 return AArch64::LDPWpost;
534 case AArch64::LDPXi:
535 return AArch64::LDPXpost;
536 case AArch64::STPSi:
537 return AArch64::STPSpost;
538 case AArch64::STPDi:
539 return AArch64::STPDpost;
540 case AArch64::STPQi:
541 return AArch64::STPQpost;
542 case AArch64::STPWi:
543 return AArch64::STPWpost;
544 case AArch64::STPXi:
545 return AArch64::STPXpost;
546 case AArch64::STGOffset:
547 return AArch64::STGPostIndex;
548 case AArch64::STZGOffset:
549 return AArch64::STZGPostIndex;
550 case AArch64::ST2GOffset:
551 return AArch64::ST2GPostIndex;
552 case AArch64::STZ2GOffset:
553 return AArch64::STZ2GPostIndex;
554 case AArch64::STGPi:
555 return AArch64::STGPpost;
556 }
557 }
558
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
560
561 unsigned OpcA = FirstMI.getOpcode();
562 unsigned OpcB = MI.getOpcode();
563
564 switch (OpcA) {
565 default:
566 return false;
567 case AArch64::STRSpre:
568 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
569 case AArch64::STRDpre:
570 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
571 case AArch64::STRQpre:
572 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
573 case AArch64::STRWpre:
574 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
575 case AArch64::STRXpre:
576 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
577 case AArch64::LDRSpre:
578 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
579 case AArch64::LDRDpre:
580 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
581 case AArch64::LDRQpre:
582 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
583 case AArch64::LDRWpre:
584 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
585 case AArch64::LDRXpre:
586 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
587 }
588 }
589
590 // Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
592 int &MinOffset, int &MaxOffset) {
593 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
594 bool IsTagStore = isTagStore(MI);
595 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
596 // as in the "unsigned offset" variant.
597 // All other pre/post indexed ldst instructions are unscaled.
598 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
599
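// The immediate is a signed 7-bit (paired) or signed 9-bit (single-register)
// value in units of Scale, e.g. an X-register pair with Scale == 8 covers
// byte offsets [-512, 504].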
600 if (IsPaired) {
601 MinOffset = -64;
602 MaxOffset = 63;
603 } else {
604 MinOffset = -256;
605 MaxOffset = 255;
606 }
607 }
608
static MachineOperand &getLdStRegOp(MachineInstr &MI,
610 unsigned PairedRegOp = 0) {
611 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
612 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
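// Pre-indexed ld/st instructions additionally define the updated base
// register as operand 0, so their data register operand(s) start at index 1.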
613 if (IsPreLdSt)
614 PairedRegOp += 1;
615 unsigned Idx =
616 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
617 return MI.getOperand(Idx);
618 }
619
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
621 MachineInstr &StoreInst,
622 const AArch64InstrInfo *TII) {
623 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
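// E.g. a 4-byte load at [x0, #4] is fully covered by an 8-byte store at [x0]:
// the load's bytes [4, 8) lie within the store's bytes [0, 8).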
624 int LoadSize = TII->getMemScale(LoadInst);
625 int StoreSize = TII->getMemScale(StoreInst);
626 int UnscaledStOffset =
627 TII->hasUnscaledLdStOffset(StoreInst)
628 ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
629 : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
630 int UnscaledLdOffset =
631 TII->hasUnscaledLdStOffset(LoadInst)
632 ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
633 : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
634 return (UnscaledStOffset <= UnscaledLdOffset) &&
635 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
636 }
637
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
639 unsigned Opc = MI.getOpcode();
640 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
641 isNarrowStore(Opc)) &&
642 getLdStRegOp(MI).getReg() == AArch64::WZR;
643 }
644
static bool isPromotableLoadFromStore(MachineInstr &MI) {
646 switch (MI.getOpcode()) {
647 default:
648 return false;
649 // Scaled instructions.
650 case AArch64::LDRBBui:
651 case AArch64::LDRHHui:
652 case AArch64::LDRWui:
653 case AArch64::LDRXui:
654 // Unscaled instructions.
655 case AArch64::LDURBBi:
656 case AArch64::LDURHHi:
657 case AArch64::LDURWi:
658 case AArch64::LDURXi:
659 return true;
660 }
661 }
662
static bool isMergeableLdStUpdate(MachineInstr &MI) {
664 unsigned Opc = MI.getOpcode();
665 switch (Opc) {
666 default:
667 return false;
668 // Scaled instructions.
669 case AArch64::STRSui:
670 case AArch64::STRDui:
671 case AArch64::STRQui:
672 case AArch64::STRXui:
673 case AArch64::STRWui:
674 case AArch64::STRHHui:
675 case AArch64::STRBBui:
676 case AArch64::LDRSui:
677 case AArch64::LDRDui:
678 case AArch64::LDRQui:
679 case AArch64::LDRXui:
680 case AArch64::LDRWui:
681 case AArch64::LDRHHui:
682 case AArch64::LDRBBui:
683 case AArch64::STGOffset:
684 case AArch64::STZGOffset:
685 case AArch64::ST2GOffset:
686 case AArch64::STZ2GOffset:
687 case AArch64::STGPi:
688 // Unscaled instructions.
689 case AArch64::STURSi:
690 case AArch64::STURDi:
691 case AArch64::STURQi:
692 case AArch64::STURWi:
693 case AArch64::STURXi:
694 case AArch64::LDURSi:
695 case AArch64::LDURDi:
696 case AArch64::LDURQi:
697 case AArch64::LDURWi:
698 case AArch64::LDURXi:
699 // Paired instructions.
700 case AArch64::LDPSi:
701 case AArch64::LDPSWi:
702 case AArch64::LDPDi:
703 case AArch64::LDPQi:
704 case AArch64::LDPWi:
705 case AArch64::LDPXi:
706 case AArch64::STPSi:
707 case AArch64::STPDi:
708 case AArch64::STPQi:
709 case AArch64::STPWi:
710 case AArch64::STPXi:
711 // Make sure this is a reg+imm (as opposed to an address reloc).
712 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
713 return false;
714
715 return true;
716 }
717 }
718
719 MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
721 MachineBasicBlock::iterator MergeMI,
722 const LdStPairFlags &Flags) {
723 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
724 "Expected promotable zero stores.");
725
726 MachineBasicBlock::iterator E = I->getParent()->end();
727 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
728 // If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way, the merge will invalidate the iterator,
730 // and we don't need to scan the new instruction, as it's a pairwise
731 // instruction, which we're not considering for further action anyway.
732 if (NextI == MergeMI)
733 NextI = next_nodbg(NextI, E);
734
735 unsigned Opc = I->getOpcode();
736 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
737 int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
738
739 bool MergeForward = Flags.getMergeForward();
740 // Insert our new paired instruction after whichever of the paired
741 // instructions MergeForward indicates.
742 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
// MergeForward also determines from which instruction we copy the base
// register operand, so that we get flags compatible with the input code.
745 const MachineOperand &BaseRegOp =
746 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
747 : AArch64InstrInfo::getLdStBaseOp(*I);
748
749 // Which register is Rt and which is Rt2 depends on the offset order.
750 MachineInstr *RtMI;
751 if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() ==
752 AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
753 RtMI = &*MergeMI;
754 else
755 RtMI = &*I;
756
757 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
758 // Change the scaled offset from small to large type.
759 if (IsScaled) {
760 assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
761 OffsetImm /= 2;
762 }
763
764 // Construct the new instruction.
765 DebugLoc DL = I->getDebugLoc();
766 MachineBasicBlock *MBB = I->getParent();
767 MachineInstrBuilder MIB;
768 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
769 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
770 .add(BaseRegOp)
771 .addImm(OffsetImm)
772 .cloneMergedMemRefs({&*I, &*MergeMI})
773 .setMIFlags(I->mergeFlagsWith(*MergeMI));
774 (void)MIB;
775
776 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
777 LLVM_DEBUG(I->print(dbgs()));
778 LLVM_DEBUG(dbgs() << " ");
779 LLVM_DEBUG(MergeMI->print(dbgs()));
780 LLVM_DEBUG(dbgs() << " with instruction:\n ");
781 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
782 LLVM_DEBUG(dbgs() << "\n");
783
784 // Erase the old instructions.
785 I->eraseFromParent();
786 MergeMI->eraseFromParent();
787 return NextI;
788 }
789
790 // Apply Fn to all instructions between MI and the beginning of the block, until
// a def for DefReg is reached. Returns true iff Fn returns true for all
792 // visited instructions. Stop after visiting Limit iterations.
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
794 const TargetRegisterInfo *TRI, unsigned Limit,
795 std::function<bool(MachineInstr &, bool)> &Fn) {
796 auto MBB = MI.getParent();
797 for (MachineInstr &I :
798 instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
799 if (!Limit)
800 return false;
801 --Limit;
802
803 bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
804 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
805 TRI->regsOverlap(MOP.getReg(), DefReg);
806 });
807 if (!Fn(I, isDef))
808 return false;
809 if (isDef)
810 break;
811 }
812 return true;
813 }
814
static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
816 const TargetRegisterInfo *TRI) {
817
818 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
819 if (MOP.isReg() && MOP.isKill())
820 Units.removeReg(MOP.getReg());
821
822 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
823 if (MOP.isReg() && !MOP.isKill())
824 Units.addReg(MOP.getReg());
825 }
826
827 MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
829 MachineBasicBlock::iterator Paired,
830 const LdStPairFlags &Flags) {
831 MachineBasicBlock::iterator E = I->getParent()->end();
832 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
833 // If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way, the merge will invalidate the iterator,
835 // and we don't need to scan the new instruction, as it's a pairwise
836 // instruction, which we're not considering for further action anyway.
837 if (NextI == Paired)
838 NextI = next_nodbg(NextI, E);
839
840 int SExtIdx = Flags.getSExtIdx();
841 unsigned Opc =
842 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
843 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
844 int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
845
846 bool MergeForward = Flags.getMergeForward();
847
848 Optional<MCPhysReg> RenameReg = Flags.getRenameReg();
849 if (MergeForward && RenameReg) {
850 MCRegister RegToRename = getLdStRegOp(*I).getReg();
851 DefinedInBB.addReg(*RenameReg);
852
853 // Return the sub/super register for RenameReg, matching the size of
854 // OriginalReg.
855 auto GetMatchingSubReg = [this,
856 RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
857 for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
858 if (TRI->getMinimalPhysRegClass(OriginalReg) ==
859 TRI->getMinimalPhysRegClass(SubOrSuper))
860 return SubOrSuper;
861 llvm_unreachable("Should have found matching sub or super register!");
862 };
863
864 std::function<bool(MachineInstr &, bool)> UpdateMIs =
865 [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
866 if (IsDef) {
867 bool SeenDef = false;
868 for (auto &MOP : MI.operands()) {
869 // Rename the first explicit definition and all implicit
870 // definitions matching RegToRename.
871 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
872 (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
873 TRI->regsOverlap(MOP.getReg(), RegToRename)) {
874 assert((MOP.isImplicit() ||
875 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
876 "Need renamable operands");
877 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
878 SeenDef = true;
879 }
880 }
881 } else {
882 for (auto &MOP : MI.operands()) {
883 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
884 TRI->regsOverlap(MOP.getReg(), RegToRename)) {
885 assert((MOP.isImplicit() ||
886 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
887 "Need renamable operands");
888 MOP.setReg(GetMatchingSubReg(MOP.getReg()));
889 }
890 }
891 }
892 LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
893 return true;
894 };
895 forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
896
897 #if !defined(NDEBUG)
898 // Make sure the register used for renaming is not used between the paired
899 // instructions. That would trash the content before the new paired
900 // instruction.
901 for (auto &MI :
902 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
903 std::next(I), std::next(Paired)))
904 assert(all_of(MI.operands(),
905 [this, &RenameReg](const MachineOperand &MOP) {
906 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
907 MOP.isUndef() ||
908 !TRI->regsOverlap(MOP.getReg(), *RenameReg);
909 }) &&
910 "Rename register used between paired instruction, trashing the "
911 "content");
912 #endif
913 }
914
915 // Insert our new paired instruction after whichever of the paired
916 // instructions MergeForward indicates.
917 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
// MergeForward also determines from which instruction we copy the base
// register operand, so that we get flags compatible with the input code.
920 const MachineOperand &BaseRegOp =
921 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
922 : AArch64InstrInfo::getLdStBaseOp(*I);
923
924 int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
925 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
926 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
927 if (IsUnscaled != PairedIsUnscaled) {
928 // We're trying to pair instructions that differ in how they are scaled. If
929 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
930 // the opposite (i.e., make Paired's offset unscaled).
931 int MemSize = TII->getMemScale(*Paired);
932 if (PairedIsUnscaled) {
933 // If the unscaled offset isn't a multiple of the MemSize, we can't
934 // pair the operations together.
935 assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
936 "Offset should be a multiple of the stride!");
937 PairedOffset /= MemSize;
938 } else {
939 PairedOffset *= MemSize;
940 }
941 }
942
943 // Which register is Rt and which is Rt2 depends on the offset order.
// However, for pre-indexed load/stores, Rt should be the register of the
// pre-indexed load/store.
946 MachineInstr *RtMI, *Rt2MI;
947 if (Offset == PairedOffset + OffsetStride &&
948 !AArch64InstrInfo::isPreLdSt(*I)) {
949 RtMI = &*Paired;
950 Rt2MI = &*I;
951 // Here we swapped the assumption made for SExtIdx.
952 // I.e., we turn ldp I, Paired into ldp Paired, I.
953 // Update the index accordingly.
954 if (SExtIdx != -1)
955 SExtIdx = (SExtIdx + 1) % 2;
956 } else {
957 RtMI = &*I;
958 Rt2MI = &*Paired;
959 }
960 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
961 // Scale the immediate offset, if necessary.
962 if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
963 assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
964 "Unscaled offset cannot be scaled.");
965 OffsetImm /= TII->getMemScale(*RtMI);
966 }
967
968 // Construct the new instruction.
969 MachineInstrBuilder MIB;
970 DebugLoc DL = I->getDebugLoc();
971 MachineBasicBlock *MBB = I->getParent();
972 MachineOperand RegOp0 = getLdStRegOp(*RtMI);
973 MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
974 // Kill flags may become invalid when moving stores for pairing.
975 if (RegOp0.isUse()) {
976 if (!MergeForward) {
977 // Clear kill flags on store if moving upwards. Example:
978 // STRWui %w0, ...
979 // USE %w1
980 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
981 RegOp0.setIsKill(false);
982 RegOp1.setIsKill(false);
983 } else {
// Clear kill flags of the first store's register. Example:
985 // STRWui %w1, ...
986 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
987 // STRW %w0
988 Register Reg = getLdStRegOp(*I).getReg();
989 for (MachineInstr &MI : make_range(std::next(I), Paired))
990 MI.clearRegisterKills(Reg, TRI);
991 }
992 }
993
994 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
995 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
996
997 // Adds the pre-index operand for pre-indexed ld/st pairs.
998 if (AArch64InstrInfo::isPreLdSt(*RtMI))
999 MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1000
1001 MIB.add(RegOp0)
1002 .add(RegOp1)
1003 .add(BaseRegOp)
1004 .addImm(OffsetImm)
1005 .cloneMergedMemRefs({&*I, &*Paired})
1006 .setMIFlags(I->mergeFlagsWith(*Paired));
1007
1008 (void)MIB;
1009
1010 LLVM_DEBUG(
1011 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1012 LLVM_DEBUG(I->print(dbgs()));
1013 LLVM_DEBUG(dbgs() << " ");
1014 LLVM_DEBUG(Paired->print(dbgs()));
1015 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1016 if (SExtIdx != -1) {
1017 // Generate the sign extension for the proper result of the ldp.
1018 // I.e., with X1, that would be:
1019 // %w1 = KILL %w1, implicit-def %x1
1020 // %x1 = SBFMXri killed %x1, 0, 31
1021 MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1022 // Right now, DstMO has the extended register, since it comes from an
1023 // extended opcode.
1024 Register DstRegX = DstMO.getReg();
1025 // Get the W variant of that register.
1026 Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1027 // Update the result of LDP to use the W instead of the X variant.
1028 DstMO.setReg(DstRegW);
1029 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1030 LLVM_DEBUG(dbgs() << "\n");
1031 // Make the machine verifier happy by providing a definition for
1032 // the X register.
1033 // Insert this definition right after the generated LDP, i.e., before
1034 // InsertionPoint.
1035 MachineInstrBuilder MIBKill =
1036 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1037 .addReg(DstRegW)
1038 .addReg(DstRegX, RegState::Define);
1039 MIBKill->getOperand(2).setImplicit();
1040 // Create the sign extension.
1041 MachineInstrBuilder MIBSXTW =
1042 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1043 .addReg(DstRegX)
1044 .addImm(0)
1045 .addImm(31);
1046 (void)MIBSXTW;
1047 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1048 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1049 } else {
1050 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1051 }
1052 LLVM_DEBUG(dbgs() << "\n");
1053
1054 if (MergeForward)
1055 for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1056 if (MOP.isReg() && MOP.isKill())
1057 DefinedInBB.addReg(MOP.getReg());
1058
1059 // Erase the old instructions.
1060 I->eraseFromParent();
1061 Paired->eraseFromParent();
1062
1063 return NextI;
1064 }
1065
1066 MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1068 MachineBasicBlock::iterator StoreI) {
1069 MachineBasicBlock::iterator NextI =
1070 next_nodbg(LoadI, LoadI->getParent()->end());
1071
1072 int LoadSize = TII->getMemScale(*LoadI);
1073 int StoreSize = TII->getMemScale(*StoreI);
1074 Register LdRt = getLdStRegOp(*LoadI).getReg();
1075 const MachineOperand &StMO = getLdStRegOp(*StoreI);
1076 Register StRt = getLdStRegOp(*StoreI).getReg();
1077 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1078
1079 assert((IsStoreXReg ||
1080 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1081 "Unexpected RegClass");
1082
1083 MachineInstr *BitExtMI;
1084 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
// Remove the load if the destination register of the load is the same
// register as the stored value.
1087 if (StRt == LdRt && LoadSize == 8) {
1088 for (MachineInstr &MI : make_range(StoreI->getIterator(),
1089 LoadI->getIterator())) {
1090 if (MI.killsRegister(StRt, TRI)) {
1091 MI.clearRegisterKills(StRt, TRI);
1092 break;
1093 }
1094 }
1095 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1096 LLVM_DEBUG(LoadI->print(dbgs()));
1097 LLVM_DEBUG(dbgs() << "\n");
1098 LoadI->eraseFromParent();
1099 return NextI;
1100 }
// Replace the load with a mov if the load and store are the same size.
1102 BitExtMI =
1103 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1104 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1105 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1106 .add(StMO)
1107 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1108 .setMIFlags(LoadI->getFlags());
1109 } else {
1110 // FIXME: Currently we disable this transformation in big-endian targets as
1111 // performance and correctness are verified only in little-endian.
1112 if (!Subtarget->isLittleEndian())
1113 return NextI;
1114 bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1115 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1116 "Unsupported ld/st match");
1117 assert(LoadSize <= StoreSize && "Invalid load size");
1118 int UnscaledLdOffset =
1119 IsUnscaled
1120 ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1121 : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1122 int UnscaledStOffset =
1123 IsUnscaled
1124 ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1125 : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1126 int Width = LoadSize * 8;
1127 Register DestReg =
1128 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1129 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1130 : LdRt;
1131
1132 assert((UnscaledLdOffset >= UnscaledStOffset &&
1133 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1134 "Invalid offset");
1135
1136 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1137 int Imms = Immr + Width - 1;
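// E.g. for an 8-byte store at [x0] followed by a 2-byte load at [x0, #2]:
// Width = 16, Immr = 16 and Imms = 31, so the UBFM below extracts bits
// [31:16] of the stored value (little-endian bytes 2-3).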
1138 if (UnscaledLdOffset == UnscaledStOffset) {
1139 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1140 | ((Immr) << 6) // immr
1141 | ((Imms) << 0) // imms
1142 ;
1143
1144 BitExtMI =
1145 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1146 TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1147 DestReg)
1148 .add(StMO)
1149 .addImm(AndMaskEncoded)
1150 .setMIFlags(LoadI->getFlags());
1151 } else {
1152 BitExtMI =
1153 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1154 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1155 DestReg)
1156 .add(StMO)
1157 .addImm(Immr)
1158 .addImm(Imms)
1159 .setMIFlags(LoadI->getFlags());
1160 }
1161 }
1162
1163 // Clear kill flags between store and load.
1164 for (MachineInstr &MI : make_range(StoreI->getIterator(),
1165 BitExtMI->getIterator()))
1166 if (MI.killsRegister(StRt, TRI)) {
1167 MI.clearRegisterKills(StRt, TRI);
1168 break;
1169 }
1170
1171 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1172 LLVM_DEBUG(StoreI->print(dbgs()));
1173 LLVM_DEBUG(dbgs() << " ");
1174 LLVM_DEBUG(LoadI->print(dbgs()));
1175 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1176 LLVM_DEBUG(StoreI->print(dbgs()));
1177 LLVM_DEBUG(dbgs() << " ");
1178 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1179 LLVM_DEBUG(dbgs() << "\n");
1180
1181 // Erase the old instructions.
1182 LoadI->eraseFromParent();
1183 return NextI;
1184 }
1185
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
// Convert the byte offset used by unscaled instructions into an "element"
// offset used by the scaled pair load/store instructions.
1189 if (IsUnscaled) {
1190 // If the byte-offset isn't a multiple of the stride, there's no point
1191 // trying to match it.
1192 if (Offset % OffsetStride)
1193 return false;
1194 Offset /= OffsetStride;
1195 }
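// The paired instructions encode a signed 7-bit element offset, i.e. [-64, 63].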
1196 return Offset <= 63 && Offset >= -64;
1197 }
1198
1199 // Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
1201 // using alignTo from include/llvm/Support/MathExtras.h.
1202 // FIXME: Move this function to include/MathExtras.h?
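// E.g. alignTo(10, 8) == 16 and alignTo(-10, 8) == -8.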
static int alignTo(int Num, int PowOf2) {
1204 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1205 }
1206
static bool mayAlias(MachineInstr &MIa,
1208 SmallVectorImpl<MachineInstr *> &MemInsns,
1209 AliasAnalysis *AA) {
1210 for (MachineInstr *MIb : MemInsns)
1211 if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
1212 return true;
1213
1214 return false;
1215 }
1216
bool AArch64LoadStoreOpt::findMatchingStore(
1218 MachineBasicBlock::iterator I, unsigned Limit,
1219 MachineBasicBlock::iterator &StoreI) {
1220 MachineBasicBlock::iterator B = I->getParent()->begin();
1221 MachineBasicBlock::iterator MBBI = I;
1222 MachineInstr &LoadMI = *I;
1223 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1224
1225 // If the load is the first instruction in the block, there's obviously
1226 // not any matching store.
1227 if (MBBI == B)
1228 return false;
1229
1230 // Track which register units have been modified and used between the first
1231 // insn and the second insn.
1232 ModifiedRegUnits.clear();
1233 UsedRegUnits.clear();
1234
1235 unsigned Count = 0;
1236 do {
1237 MBBI = prev_nodbg(MBBI, B);
1238 MachineInstr &MI = *MBBI;
1239
1240 // Don't count transient instructions towards the search limit since there
1241 // may be different numbers of them if e.g. debug information is present.
1242 if (!MI.isTransient())
1243 ++Count;
1244
1245 // If the load instruction reads directly from the address to which the
1246 // store instruction writes and the stored value is not modified, we can
1247 // promote the load. Since we do not handle stores with pre-/post-index,
1248 // it's unnecessary to check if BaseReg is modified by the store itself.
// Also, we can't handle stores without an immediate offset operand, since
// the operand might be the address of a global variable.
1251 if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1252 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1253 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1254 isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1255 ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1256 StoreI = MBBI;
1257 return true;
1258 }
1259
1260 if (MI.isCall())
1261 return false;
1262
1263 // Update modified / uses register units.
1264 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1265
1266 // Otherwise, if the base register is modified, we have no match, so
1267 // return early.
1268 if (!ModifiedRegUnits.available(BaseReg))
1269 return false;
1270
1271 // If we encounter a store aliased with the load, return early.
1272 if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1273 return false;
1274 } while (MBBI != B && Count < Limit);
1275 return false;
1276 }
1277
1278 // Returns true if FirstMI and MI are candidates for merging or pairing.
1279 // Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1281 LdStPairFlags &Flags,
1282 const AArch64InstrInfo *TII) {
1283 // If this is volatile or if pairing is suppressed, not a candidate.
1284 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1285 return false;
1286
1287 // We should have already checked FirstMI for pair suppression and volatility.
1288 assert(!FirstMI.hasOrderedMemoryRef() &&
1289 !TII->isLdStPairSuppressed(FirstMI) &&
1290 "FirstMI shouldn't get here if either of these checks are true.");
1291
1292 unsigned OpcA = FirstMI.getOpcode();
1293 unsigned OpcB = MI.getOpcode();
1294
1295 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1296 if (OpcA == OpcB)
1297 return !AArch64InstrInfo::isPreLdSt(FirstMI);
1298
1299 // Try to match a sign-extended load/store with a zero-extended load/store.
1300 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1301 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1302 assert(IsValidLdStrOpc &&
1303 "Given Opc should be a Load or Store with an immediate");
1304 // OpcA will be the first instruction in the pair.
1305 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1306 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1307 return true;
1308 }
1309
1310 // If the second instruction isn't even a mergable/pairable load/store, bail
1311 // out.
1312 if (!PairIsValidLdStrOpc)
1313 return false;
1314
1315 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1316 // offsets.
1317 if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1318 return false;
1319
1320 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1321 // LDR<S,D,Q,W,X>pre-LDR<S,D,Q,W,X>ui
1322 // are candidate pairs that can be merged.
1323 if (isPreLdStPairCandidate(FirstMI, MI))
1324 return true;
1325
1326 // Try to match an unscaled load/store with a scaled load/store.
1327 return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1328 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1329
1330 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1331 }
1332
1333 static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1335 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1336 const TargetRegisterInfo *TRI) {
1337 if (!FirstMI.mayStore())
1338 return false;
1339
1340 // Check if we can find an unused register which we can use to rename
1341 // the register used by the first load/store.
1342 auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1343 MachineFunction &MF = *FirstMI.getParent()->getParent();
1344 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1345 return false;
1346
1347 auto RegToRename = getLdStRegOp(FirstMI).getReg();
1348 // For now, we only rename if the store operand gets killed at the store.
1349 if (!getLdStRegOp(FirstMI).isKill() &&
1350 !any_of(FirstMI.operands(),
1351 [TRI, RegToRename](const MachineOperand &MOP) {
1352 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1353 MOP.isImplicit() && MOP.isKill() &&
1354 TRI->regsOverlap(RegToRename, MOP.getReg());
1355 })) {
1356 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n");
1357 return false;
1358 }
1359 auto canRenameMOP = [TRI](const MachineOperand &MOP) {
1360 if (MOP.isReg()) {
1361 auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1362 // Renaming registers with multiple disjunct sub-registers (e.g. the
1363 // result of a LD3) means that all sub-registers are renamed, potentially
1364 // impacting other instructions we did not check. Bail out.
1365 // Note that this relies on the structure of the AArch64 register file. In
1366 // particular, a subregister cannot be written without overwriting the
1367 // whole register.
1368 if (RegClass->HasDisjunctSubRegs) {
1369 LLVM_DEBUG(
1370 dbgs()
1371 << " Cannot rename operands with multiple disjunct subregisters ("
1372 << MOP << ")\n");
1373 return false;
1374 }
1375 }
1376 return MOP.isImplicit() ||
1377 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1378 };
1379
1380 bool FoundDef = false;
1381
1382 // For each instruction between FirstMI and the previous def for RegToRename,
1383 // we
1384 // * check if we can rename RegToRename in this instruction
1385 // * collect the registers used and required register classes for RegToRename.
1386 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1387 bool IsDef) {
1388 LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
1389 // Currently we do not try to rename across frame-setup instructions.
1390 if (MI.getFlag(MachineInstr::FrameSetup)) {
1391 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions currently ("
1392 << MI << ")\n");
1393 return false;
1394 }
1395
1396 UsedInBetween.accumulate(MI);
1397
1398 // For a definition, check that we can rename the definition and exit the
1399 // loop.
1400 FoundDef = IsDef;
1401
1402 // For defs, check if we can rename the first def of RegToRename.
1403 if (FoundDef) {
1404 // For some pseudo instructions, we might not generate code in the end
1405 // (e.g. KILL) and we would end up without a correct def for the rename
1406 // register.
1407 // TODO: This might be overly conservative and we could handle those cases
1408 // in multiple ways:
1409 // 1. Insert an extra copy, to materialize the def.
// 2. Skip pseudo-defs until we find a non-pseudo def.
1411 if (MI.isPseudo()) {
1412 LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI
1413 << "\n");
1414 return false;
1415 }
1416
1417 for (auto &MOP : MI.operands()) {
1418 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1419 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1420 continue;
1421 if (!canRenameMOP(MOP)) {
1422 LLVM_DEBUG(dbgs()
1423 << " Cannot rename " << MOP << " in " << MI << "\n");
1424 return false;
1425 }
1426 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1427 }
1428 return true;
1429 } else {
1430 for (auto &MOP : MI.operands()) {
1431 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1432 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1433 continue;
1434
1435 if (!canRenameMOP(MOP)) {
1436 LLVM_DEBUG(dbgs()
1437 << " Cannot rename " << MOP << " in " << MI << "\n");
1438 return false;
1439 }
1440 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1441 }
1442 }
1443 return true;
1444 };
1445
1446 if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1447 return false;
1448
1449 if (!FoundDef) {
1450 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1451 return false;
1452 }
1453 return true;
1454 }
1455
1456 // Check if we can find a physical register for renaming \p Reg. This register
1457 // must:
1458 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1459 // defined registers up to the point where the renamed register will be used,
// * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range the rename register will be used,
// * be available in all used register classes (checked using RequiredClasses).
static Optional<MCPhysReg> tryToFindRegisterToRename(
1464 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1465 LiveRegUnits &UsedInBetween,
1466 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1467 const TargetRegisterInfo *TRI) {
1468 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1469
1470 // Checks if any sub- or super-register of PR is callee saved.
1471 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1472 return any_of(TRI->sub_and_superregs_inclusive(PR),
1473 [&MF, TRI](MCPhysReg SubOrSuper) {
1474 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1475 });
1476 };
1477
1478 // Check if PR or one of its sub- or super-registers can be used for all
1479 // required register classes.
1480 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1481 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1482 return any_of(TRI->sub_and_superregs_inclusive(PR),
1483 [C, TRI](MCPhysReg SubOrSuper) {
1484 return C == TRI->getMinimalPhysRegClass(SubOrSuper);
1485 });
1486 });
1487 };
1488
1489 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1490 for (const MCPhysReg &PR : *RegClass) {
1491 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1492 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1493 CanBeUsedForAllClasses(PR)) {
1494 DefinedInBB.addReg(PR);
1495 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1496 << "\n");
1497 return {PR};
1498 }
1499 }
1500 LLVM_DEBUG(dbgs() << "No rename register found from "
1501 << TRI->getRegClassName(RegClass) << "\n");
1502 return None;
1503 }
1504
1505 /// Scan the instructions looking for a load/store that can be combined with the
1506 /// current instruction into a wider equivalent or a load/store pair.
1507 MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1509 LdStPairFlags &Flags, unsigned Limit,
1510 bool FindNarrowMerge) {
1511 MachineBasicBlock::iterator E = I->getParent()->end();
1512 MachineBasicBlock::iterator MBBI = I;
1513 MachineBasicBlock::iterator MBBIWithRenameReg;
1514 MachineInstr &FirstMI = *I;
1515 MBBI = next_nodbg(MBBI, E);
1516
1517 bool MayLoad = FirstMI.mayLoad();
1518 bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
1519 Register Reg = getLdStRegOp(FirstMI).getReg();
1520 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
1521 int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
1522 int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1523 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1524
1525 Optional<bool> MaybeCanRename = None;
1526 if (!EnableRenaming)
1527 MaybeCanRename = {false};
1528
1529 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1530 LiveRegUnits UsedInBetween;
1531 UsedInBetween.init(*TRI);
1532
1533 Flags.clearRenameReg();
1534
1535 // Track which register units have been modified and used between the first
1536 // insn (inclusive) and the second insn.
1537 ModifiedRegUnits.clear();
1538 UsedRegUnits.clear();
1539
1540 // Remember any instructions that read/write memory between FirstMI and MI.
1541 SmallVector<MachineInstr *, 4> MemInsns;
1542
1543 for (unsigned Count = 0; MBBI != E && Count < Limit;
1544 MBBI = next_nodbg(MBBI, E)) {
1545 MachineInstr &MI = *MBBI;
1546
1547 UsedInBetween.accumulate(MI);
1548
1549 // Don't count transient instructions towards the search limit since there
1550 // may be different numbers of them if e.g. debug information is present.
1551 if (!MI.isTransient())
1552 ++Count;
1553
1554 Flags.setSExtIdx(-1);
1555 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1556 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
1557 assert(MI.mayLoadOrStore() && "Expected memory operation.");
1558 // If we've found another instruction with the same opcode, check to see
1559 // if the base and offset are compatible with our starting instruction.
1560 // These instructions all have scaled immediate operands, so we just
1561 // check for +1/-1. Make sure to check the new instruction offset is
1562 // actually an immediate and not a symbolic reference destined for
1563 // a relocation.
1564 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
1565 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
1566 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1567 if (IsUnscaled != MIIsUnscaled) {
1568 // We're trying to pair instructions that differ in how they are scaled.
1569 // If FirstMI is scaled, convert MI's offset into scaled units.
1570 // Otherwise, do the opposite (i.e., convert MI's offset into unscaled units).
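// For example (illustrative): when FirstMI is a scaled LDRXui and MI is an
// unscaled LDURXi with byte offset #8, the memory scale is 8, so MI's offset
// is converted to 1, i.e. expressed in the same scaled units as FirstMI.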
1571 int MemSize = TII->getMemScale(MI);
1572 if (MIIsUnscaled) {
1573 // If the unscaled offset isn't a multiple of the MemSize, we can't
1574 // pair the operations together: bail and keep looking.
1575 if (MIOffset % MemSize) {
1576 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1577 UsedRegUnits, TRI);
1578 MemInsns.push_back(&MI);
1579 continue;
1580 }
1581 MIOffset /= MemSize;
1582 } else {
1583 MIOffset *= MemSize;
1584 }
1585 }
1586
1587 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1588
1589 if (BaseReg == MIBaseReg) {
1590 // If the offset of the second ld/st is not equal to the size of the
1591 // destination register, it can't be paired with a pre-indexed ld/st.
1592 // Additionally, if the base register is used or modified, the operations
1593 // can't be paired: bail and keep looking.
1594 if (IsPreLdSt) {
1595 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1596 bool IsBaseRegUsed = !UsedRegUnits.available(
1597 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1598 bool IsBaseRegModified = !ModifiedRegUnits.available(
1599 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1600 // If the stored value and the address register of the second instruction
1601 // are the same, the store needs to use the updated register and therefore
1602 // must not be folded.
1603 bool IsMIRegTheSame =
1604 TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1605 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1606 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1607 IsMIRegTheSame) {
1608 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1609 UsedRegUnits, TRI);
1610 MemInsns.push_back(&MI);
1611 continue;
1612 }
1613 } else {
1614 if ((Offset != MIOffset + OffsetStride) &&
1615 (Offset + OffsetStride != MIOffset)) {
1616 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1617 UsedRegUnits, TRI);
1618 MemInsns.push_back(&MI);
1619 continue;
1620 }
1621 }
1622
1623 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1624 if (FindNarrowMerge) {
1625 // If the alignment requirements of the scaled wide load/store
1626 // instruction can't express the offset of the scaled narrow input,
1627 // bail and keep looking. For promotable zero stores, allow only when
1628 // the stored value is the same (i.e., WZR).
1629 if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1630 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1631 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1632 UsedRegUnits, TRI);
1633 MemInsns.push_back(&MI);
1634 continue;
1635 }
1636 } else {
1637 // Pairwise instructions have a 7-bit signed offset field. Single
1638 // insns have a 12-bit unsigned offset field. If the resultant
1639 // immediate offset of merging these instructions is out of range for
1640 // a pairwise instruction, bail and keep looking.
1641 if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1642 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1643 UsedRegUnits, TRI);
1644 MemInsns.push_back(&MI);
1645 continue;
1646 }
1647 // If the alignment requirements of the paired (scaled) instruction
1648 // can't express the offset of the unscaled input, bail and keep
1649 // looking.
1650 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1651 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1652 UsedRegUnits, TRI);
1653 MemInsns.push_back(&MI);
1654 continue;
1655 }
1656 }
1657 // If the destination register of one load is the same register or a
1658 // sub/super register of the other load, bail and keep looking. A
1659 // load-pair instruction with both destination registers the same is
1660 // UNPREDICTABLE and will result in an exception.
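// For example (illustrative), ldp x0, x0, [x2] has both destination
// registers equal and is an UNPREDICTABLE encoding.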
1661 if (MayLoad &&
1662 TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
1663 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
1664 TRI);
1665 MemInsns.push_back(&MI);
1666 continue;
1667 }
1668
1669 // If the BaseReg has been modified, then we cannot do the optimization.
1670 // For example, in the following pattern
1671 // ldr x1 [x2]
1672 // ldr x2 [x3]
1673 // ldr x4 [x2, #8],
1674 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1675 if (!ModifiedRegUnits.available(BaseReg))
1676 return E;
1677
1678 // If the Rt of the second instruction was not modified or used between
1679 // the two instructions and none of the instructions between the second
1680 // and first alias with the second, we can combine the second into the
1681 // first.
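// For example (illustrative), in
//   ldr x0, [x2]
//   add x3, x3, #1
//   ldr x1, [x2, #8]
// x1 is neither modified nor used in between and no intervening instruction
// touches memory, so the second load can be moved up and combined with the
// first into ldp x0, x1, [x2].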
1682 if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1683 !(MI.mayLoad() &&
1684 !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1685 !mayAlias(MI, MemInsns, AA)) {
1686
1687 Flags.setMergeForward(false);
1688 Flags.clearRenameReg();
1689 return MBBI;
1690 }
1691
1692 // Likewise, if the Rt of the first instruction is not modified or used
1693 // between the two instructions and none of the instructions between the
1694 // first and the second alias with the first, we can combine the first
1695 // into the second.
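// For example (illustrative), in
//   str w0, [x2]
//   mov w1, #17
//   str w1, [x2, #4]
// the second store's data register is defined in between, so the pair cannot
// be formed at the first store; but w0 is untouched and nothing aliases the
// first store, so it can be moved down and combined into stp w0, w1, [x2] at
// the second store's position.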
1696 if (!(MayLoad &&
1697 !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1698 !mayAlias(FirstMI, MemInsns, AA)) {
1699
1700 if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1701 Flags.setMergeForward(true);
1702 Flags.clearRenameReg();
1703 return MBBI;
1704 }
1705
1706 if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1707 if (!MaybeCanRename)
1708 MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
1709 RequiredClasses, TRI)};
1710
1711 if (*MaybeCanRename) {
1712 Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
1713 *FirstMI.getParent()->getParent(), Reg, DefinedInBB,
1714 UsedInBetween, RequiredClasses, TRI);
1715 if (MaybeRenameReg) {
1716 Flags.setRenameReg(*MaybeRenameReg);
1717 Flags.setMergeForward(true);
1718 MBBIWithRenameReg = MBBI;
1719 }
1720 }
1721 }
1722 }
1723 // Unable to combine these instructions due to interference in between.
1724 // Keep looking.
1725 }
1726 }
1727
1728 if (Flags.getRenameReg())
1729 return MBBIWithRenameReg;
1730
1731 // The instruction wasn't a matching load or store. Stop searching if we
1732 // encounter a call instruction that might modify memory.
1733 if (MI.isCall())
1734 return E;
1735
1736 // Update modified / used register units.
1737 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1738
1739 // Otherwise, if the base register is modified, we have no match, so
1740 // return early.
1741 if (!ModifiedRegUnits.available(BaseReg))
1742 return E;
1743
1744 // Update list of instructions that read/write memory.
1745 if (MI.mayLoadOrStore())
1746 MemInsns.push_back(&MI);
1747 }
1748 return E;
1749 }
1750
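// If \p MI is a frame setup/destroy load or store using SP as the base and
// \p MaybeCFI is a CFA def/offset CFI instruction immediately following it,
// return that CFI instruction (so the caller can move it after the merged
// pre-indexed instruction); otherwise return the end iterator.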
1751 static MachineBasicBlock::iterator
1752 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1753 auto End = MI.getParent()->end();
1754 if (MaybeCFI == End ||
1755 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1756 !(MI.getFlag(MachineInstr::FrameSetup) ||
1757 MI.getFlag(MachineInstr::FrameDestroy)) ||
1758 AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
1759 return End;
1760
1761 const MachineFunction &MF = *MI.getParent()->getParent();
1762 unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1763 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1764 switch (CFI.getOperation()) {
1765 case MCCFIInstruction::OpDefCfa:
1766 case MCCFIInstruction::OpDefCfaOffset:
1767 return MaybeCFI;
1768 default:
1769 return End;
1770 }
1771 }
1772
1773 MachineBasicBlock::iterator
1774 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1775 MachineBasicBlock::iterator Update,
1776 bool IsPreIdx) {
1777 assert((Update->getOpcode() == AArch64::ADDXri ||
1778 Update->getOpcode() == AArch64::SUBXri) &&
1779 "Unexpected base register update instruction to merge!");
1780 MachineBasicBlock::iterator E = I->getParent()->end();
1781 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1782
1783 // If we are updating the SP and the following instruction is a CFI
1784 // instruction related to the CFA offset, move it after the merged instruction.
1785 MachineBasicBlock::iterator CFI =
1786 IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
1787
1788 // Return the instruction following the merged instruction, which is
1789 // the instruction following our unmerged load. Unless that's the add/sub
1790 // instruction we're merging, in which case it's the one after that.
1791 if (NextI == Update)
1792 NextI = next_nodbg(NextI, E);
1793
1794 int Value = Update->getOperand(2).getImm();
1795 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1796 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1797 if (Update->getOpcode() == AArch64::SUBXri)
1798 Value = -Value;
1799
1800 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1801 : getPostIndexedOpcode(I->getOpcode());
1802 MachineInstrBuilder MIB;
1803 int Scale, MinOffset, MaxOffset;
1804 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
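// For example (illustrative): folding add sp, sp, #64 into stp x29, x30, [sp]
// yields stp x29, x30, [sp], #64; the paired post-index form encodes its
// immediate in units of the 8-byte access size, so Value / Scale here is 8.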
1805 if (!AArch64InstrInfo::isPairedLdSt(*I)) {
1806 // Non-paired instruction.
1807 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1808 .add(getLdStRegOp(*Update))
1809 .add(getLdStRegOp(*I))
1810 .add(AArch64InstrInfo::getLdStBaseOp(*I))
1811 .addImm(Value / Scale)
1812 .setMemRefs(I->memoperands())
1813 .setMIFlags(I->mergeFlagsWith(*Update));
1814 } else {
1815 // Paired instruction.
1816 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1817 .add(getLdStRegOp(*Update))
1818 .add(getLdStRegOp(*I, 0))
1819 .add(getLdStRegOp(*I, 1))
1820 .add(AArch64InstrInfo::getLdStBaseOp(*I))
1821 .addImm(Value / Scale)
1822 .setMemRefs(I->memoperands())
1823 .setMIFlags(I->mergeFlagsWith(*Update));
1824 }
1825 if (CFI != E) {
1826 MachineBasicBlock *MBB = I->getParent();
1827 MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
1828 }
1829
1830 if (IsPreIdx) {
1831 ++NumPreFolded;
1832 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
1833 } else {
1834 ++NumPostFolded;
1835 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
1836 }
1837 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
1838 LLVM_DEBUG(I->print(dbgs()));
1839 LLVM_DEBUG(dbgs() << " ");
1840 LLVM_DEBUG(Update->print(dbgs()));
1841 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1842 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1843 LLVM_DEBUG(dbgs() << "\n");
1844
1845 // Erase the old instructions for the block.
1846 I->eraseFromParent();
1847 Update->eraseFromParent();
1848
1849 return NextI;
1850 }
1851
1852 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
1853 MachineInstr &MI,
1854 unsigned BaseReg, int Offset) {
1855 switch (MI.getOpcode()) {
1856 default:
1857 break;
1858 case AArch64::SUBXri:
1859 case AArch64::ADDXri:
1860 // Make sure it's a vanilla immediate operand, not a relocation or
1861 // anything else we can't handle.
1862 if (!MI.getOperand(2).isImm())
1863 break;
1864 // Watch out for 1 << 12 shifted value.
1865 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
1866 break;
1867
1868 // The update instruction source and destination register must be the
1869 // same as the load/store base register.
1870 if (MI.getOperand(0).getReg() != BaseReg ||
1871 MI.getOperand(1).getReg() != BaseReg)
1872 break;
1873
1874 int UpdateOffset = MI.getOperand(2).getImm();
1875 if (MI.getOpcode() == AArch64::SUBXri)
1876 UpdateOffset = -UpdateOffset;
1877
1878 // The immediate must be a multiple of the scaling factor of the pre/post
1879 // indexed instruction.
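// For example (illustrative), a pre/post-indexed ldp/stp of X registers uses
// a scale of 8, so only updates that are multiples of 8 (e.g. #16, giving a
// scaled offset of 2) can be folded.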
1880 int Scale, MinOffset, MaxOffset;
1881 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
1882 if (UpdateOffset % Scale != 0)
1883 break;
1884
1885 // Scaled offset must fit in the instruction immediate.
1886 int ScaledOffset = UpdateOffset / Scale;
1887 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
1888 break;
1889
1890 // If we have a non-zero Offset, we check that it matches the amount
1891 // we're adding to the register.
1892 if (!Offset || Offset == UpdateOffset)
1893 return true;
1894 break;
1895 }
1896 return false;
1897 }
1898
1899 static bool needsWinCFI(const MachineFunction *MF) {
1900 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1901 MF->getFunction().needsUnwindTableEntry();
1902 }
1903
1904 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1905 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
1906 MachineBasicBlock::iterator E = I->getParent()->end();
1907 MachineInstr &MemMI = *I;
1908 MachineBasicBlock::iterator MBBI = I;
1909
1910 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
1911 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
1912 TII->getMemScale(MemMI);
1913
1914 // Scan forward looking for post-index opportunities. Updating instructions
1915 // can't be formed if the memory instruction doesn't have the offset we're
1916 // looking for.
1917 if (MIUnscaledOffset != UnscaledOffset)
1918 return E;
1919
1920 // If the base register overlaps a source/destination register, we can't
1921 // merge the update. This does not apply to tag store instructions, which
1922 // ignore the address part of the source register.
1923 // Nor does it apply to STGPi, which, unlike normal stores, does not have
1924 // unpredictable behavior in this case and always performs the writeback
1925 // after reading the source register value.
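// For example (illustrative), folding an add into ldr x0, [x0] would form
// ldr x0, [x0], #imm, where the loaded register is also the written-back
// base; such writeback forms are unpredictable and must be avoided.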
1926 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
1927 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
1928 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1929 Register DestReg = getLdStRegOp(MemMI, i).getReg();
1930 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1931 return E;
1932 }
1933 }
1934
1935 // Track which register units have been modified and used between the first
1936 // insn (inclusive) and the second insn.
1937 ModifiedRegUnits.clear();
1938 UsedRegUnits.clear();
1939 MBBI = next_nodbg(MBBI, E);
1940
1941 // We can't post-increment the stack pointer if any instruction between
1942 // the memory access (I) and the increment (MBBI) can access the memory
1943 // region defined by [SP, MBBI].
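// For example (illustrative), in
//   ldr x0, [sp]
//   str x1, [x2]      ; x2 may point into the stack area about to be freed
//   add sp, sp, #16
// folding the add into the load as ldr x0, [sp], #16 would move the stack
// deallocation above the store, so we conservatively give up when any
// intervening instruction may access memory.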
1944 const bool BaseRegSP = BaseReg == AArch64::SP;
1945 if (BaseRegSP && needsWinCFI(I->getMF())) {
1946 // FIXME: For now, we always block the optimization over SP on Windows
1947 // targets, as it requires adjusting the unwind/debug info; messing up
1948 // the unwind info can actually cause a miscompile.
1949 return E;
1950 }
1951
1952 for (unsigned Count = 0; MBBI != E && Count < Limit;
1953 MBBI = next_nodbg(MBBI, E)) {
1954 MachineInstr &MI = *MBBI;
1955
1956 // Don't count transient instructions towards the search limit since there
1957 // may be different numbers of them if e.g. debug information is present.
1958 if (!MI.isTransient())
1959 ++Count;
1960
1961 // If we found a match, return it.
1962 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
1963 return MBBI;
1964
1965 // Update the status of what the instruction clobbered and used.
1966 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1967
1968 // Otherwise, if the base register is used or modified, we have no match, so
1969 // return early.
1970 // If we are optimizing SP, do not allow instructions that may load or store
1971 // in between the load and the optimized value update.
1972 if (!ModifiedRegUnits.available(BaseReg) ||
1973 !UsedRegUnits.available(BaseReg) ||
1974 (BaseRegSP && MBBI->mayLoadOrStore()))
1975 return E;
1976 }
1977 return E;
1978 }
1979
1980 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1981 MachineBasicBlock::iterator I, unsigned Limit) {
1982 MachineBasicBlock::iterator B = I->getParent()->begin();
1983 MachineBasicBlock::iterator E = I->getParent()->end();
1984 MachineInstr &MemMI = *I;
1985 MachineBasicBlock::iterator MBBI = I;
1986 MachineFunction &MF = *MemMI.getMF();
1987
1988 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
1989 int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
1990
1991 // If the load/store is the first instruction in the block, there's obviously
1992 // not any matching update. Ditto if the memory offset isn't zero.
1993 if (MBBI == B || Offset != 0)
1994 return E;
1995 // If the base register overlaps a destination register, we can't
1996 // merge the update.
1997 if (!isTagStore(MemMI)) {
1998 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
1999 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2000 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2001 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2002 return E;
2003 }
2004 }
2005
2006 const bool BaseRegSP = BaseReg == AArch64::SP;
2007 if (BaseRegSP && needsWinCFI(I->getMF())) {
2008 // FIXME: For now, we always block the optimization over SP on Windows
2009 // targets, as it requires adjusting the unwind/debug info; messing up
2010 // the unwind info can actually cause a miscompile.
2011 return E;
2012 }
2013
2014 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2015 unsigned RedZoneSize =
2016 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
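// For example (illustrative), with a 128-byte red zone an SP decrement of
// #32 may still be folded into a pre-indexed store even when a memory access
// sits between the update and the store, whereas a decrement of #256 may not.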
2017
2018 // Track which register units have been modified and used between the first
2019 // insn (inclusive) and the second insn.
2020 ModifiedRegUnits.clear();
2021 UsedRegUnits.clear();
2022 unsigned Count = 0;
2023 bool MemAccessBeforeSPPreInc = false;
2024 do {
2025 MBBI = prev_nodbg(MBBI, B);
2026 MachineInstr &MI = *MBBI;
2027
2028 // Don't count transient instructions towards the search limit since there
2029 // may be different numbers of them if e.g. debug information is present.
2030 if (!MI.isTransient())
2031 ++Count;
2032
2033 // If we found a match, return it.
2034 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2035 // Check that the update value is within our red zone limit (which may be
2036 // zero).
2037 if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2038 return E;
2039 return MBBI;
2040 }
2041
2042 // Update the status of what the instruction clobbered and used.
2043 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2044
2045 // Otherwise, if the base register is used or modified, we have no match, so
2046 // return early.
2047 if (!ModifiedRegUnits.available(BaseReg) ||
2048 !UsedRegUnits.available(BaseReg))
2049 return E;
2050 // Keep track of whether we have a memory access before an SP pre-increment;
2051 // in that case we need to validate later that the update amount respects the
2052 // red zone.
2053 if (BaseRegSP && MBBI->mayLoadOrStore())
2054 MemAccessBeforeSPPreInc = true;
2055 } while (MBBI != B && Count < Limit);
2056 return E;
2057 }
2058
2059 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2060 MachineBasicBlock::iterator &MBBI) {
2061 MachineInstr &MI = *MBBI;
2062 // If this is a volatile load, don't mess with it.
2063 if (MI.hasOrderedMemoryRef())
2064 return false;
2065
2066 // Make sure this is a reg+imm.
2067 // FIXME: It is possible to extend it to handle reg+reg cases.
2068 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2069 return false;
2070
2071 // Look backward up to LdStLimit instructions.
2072 MachineBasicBlock::iterator StoreI;
2073 if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2074 ++NumLoadsFromStoresPromoted;
2075 // Promote the load. Keeping the iterator straight is a
2076 // pain, so we let the merge routine tell us what the next instruction
2077 // is after it's done mucking about.
2078 MBBI = promoteLoadFromStore(MBBI, StoreI);
2079 return true;
2080 }
2081 return false;
2082 }
2083
2084 // Merge adjacent zero stores into a wider store.
2085 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2086 MachineBasicBlock::iterator &MBBI) {
2087 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2088 MachineInstr &MI = *MBBI;
2089 MachineBasicBlock::iterator E = MI.getParent()->end();
2090
2091 if (!TII->isCandidateToMergeOrPair(MI))
2092 return false;
2093
2094 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2095 LdStPairFlags Flags;
2096 MachineBasicBlock::iterator MergeMI =
2097 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2098 if (MergeMI != E) {
2099 ++NumZeroStoresPromoted;
2100
2101 // Keeping the iterator straight is a pain, so we let the merge routine tell
2102 // us what the next instruction is after it's done mucking about.
2103 MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2104 return true;
2105 }
2106 return false;
2107 }
2108
2109 // Find loads and stores that can be merged into a single load or store pair
2110 // instruction.
2111 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2112 MachineInstr &MI = *MBBI;
2113 MachineBasicBlock::iterator E = MI.getParent()->end();
2114
2115 if (!TII->isCandidateToMergeOrPair(MI))
2116 return false;
2117
2118 // Early exit if no insn could possibly pair with this offset. (The paired
2119 // form has 6 bits of positive range; we also allow one extra stride in case
2120 // a later insn matches at Offset-1.)
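// For example (illustrative), a scaled ldr x0, [x1, #800] carries the
// immediate 100 (in units of 8), which is already far outside the signed
// 7-bit paired range, so no ldp could ever be formed and we skip the scan.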
2121 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2122 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2123 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2124 // Allow one more for offset.
2125 if (Offset > 0)
2126 Offset -= OffsetStride;
2127 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2128 return false;
2129
2130 // Look ahead up to LdStLimit instructions for a pairable instruction.
2131 LdStPairFlags Flags;
2132 MachineBasicBlock::iterator Paired =
2133 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2134 if (Paired != E) {
2135 ++NumPairCreated;
2136 if (TII->hasUnscaledLdStOffset(MI))
2137 ++NumUnscaledPairCreated;
2138 // Keeping the iterator straight is a pain, so we let the merge routine tell
2139 // us what the next instruction is after it's done mucking about.
2140 auto Prev = std::prev(MBBI);
2141 MBBI = mergePairedInsns(MBBI, Paired, Flags);
2142 // Collect liveness info for instructions between Prev and the new position
2143 // MBBI.
2144 for (auto I = std::next(Prev); I != MBBI; I++)
2145 updateDefinedRegisters(*I, DefinedInBB, TRI);
2146
2147 return true;
2148 }
2149 return false;
2150 }
2151
2152 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2153 (MachineBasicBlock::iterator &MBBI) {
2154 MachineInstr &MI = *MBBI;
2155 MachineBasicBlock::iterator E = MI.getParent()->end();
2156 MachineBasicBlock::iterator Update;
2157
2158 // Look forward to try to form a post-index instruction. For example,
2159 // ldr x0, [x20]
2160 // add x20, x20, #32
2161 // merged into:
2162 // ldr x0, [x20], #32
2163 Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2164 if (Update != E) {
2165 // Merge the update into the ld/st.
2166 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
2167 return true;
2168 }
2169
2170 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2171 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2172 return false;
2173
2174 // Look back to try to find a pre-index instruction. For example,
2175 // add x0, x0, #8
2176 // ldr x1, [x0]
2177 // merged into:
2178 // ldr x1, [x0, #8]!
2179 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2180 if (Update != E) {
2181 // Merge the update into the ld/st.
2182 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2183 return true;
2184 }
2185
2186 // The immediate in the load/store is scaled by the size of the memory
2187 // operation. The immediate in the add we're looking for,
2188 // however, is not, so adjust here.
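// For example (illustrative), ldr x1, [x0, #64] carries the immediate 8
// (scaled by the 8-byte access size), so UnscaledOffset is 64 and only an
// update that adds exactly 64 to x0 can be merged as a pre-index update.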
2189 int UnscaledOffset =
2190 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2191
2192 // Look forward to try to find a pre-index instruction. For example,
2193 // ldr x1, [x0, #64]
2194 // add x0, x0, #64
2195 // merged into:
2196 // ldr x1, [x0, #64]!
2197 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2198 if (Update != E) {
2199 // Merge the update into the ld/st.
2200 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2201 return true;
2202 }
2203
2204 return false;
2205 }
2206
2207 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2208 bool EnableNarrowZeroStOpt) {
2209
2210 bool Modified = false;
2211 // Four transformations to do here:
2212 // 1) Find loads that directly read from stores and promote them by
2213 // replacing with mov instructions. If the store is wider than the load,
2214 // the load will be replaced with a bitfield extract.
2215 // e.g.,
2216 // str w1, [x0, #4]
2217 // ldrh w2, [x0, #6]
2218 // ; becomes
2219 // str w1, [x0, #4]
2220 // lsr w2, w1, #16
2221 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2222 MBBI != E;) {
2223 if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2224 Modified = true;
2225 else
2226 ++MBBI;
2227 }
2228 // 2) Merge adjacent zero stores into a wider store.
2229 // e.g.,
2230 // strh wzr, [x0]
2231 // strh wzr, [x0, #2]
2232 // ; becomes
2233 // str wzr, [x0]
2234 // e.g.,
2235 // str wzr, [x0]
2236 // str wzr, [x0, #4]
2237 // ; becomes
2238 // str xzr, [x0]
2239 if (EnableNarrowZeroStOpt)
2240 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2241 MBBI != E;) {
2242 if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2243 Modified = true;
2244 else
2245 ++MBBI;
2246 }
2247 // 3) Find loads and stores that can be merged into a single load or store
2248 // pair instruction.
2249 // e.g.,
2250 // ldr x0, [x2]
2251 // ldr x1, [x2, #8]
2252 // ; becomes
2253 // ldp x0, x1, [x2]
2254
2255 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2256 DefinedInBB.clear();
2257 DefinedInBB.addLiveIns(MBB);
2258 }
2259
2260 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2261 MBBI != E;) {
2262 // Track currently live registers up to this point, to help with
2263 // searching for a rename register on demand.
2264 updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2265 if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2266 Modified = true;
2267 else
2268 ++MBBI;
2269 }
2270 // 4) Find base register updates that can be merged into the load or store
2271 // as a base-reg writeback.
2272 // e.g.,
2273 // ldr x0, [x2]
2274 // add x2, x2, #4
2275 // ; becomes
2276 // ldr x0, [x2], #4
2277 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2278 MBBI != E;) {
2279 if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2280 Modified = true;
2281 else
2282 ++MBBI;
2283 }
2284
2285 return Modified;
2286 }
2287
2288 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2289 if (skipFunction(Fn.getFunction()))
2290 return false;
2291
2292 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2293 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2294 TRI = Subtarget->getRegisterInfo();
2295 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2296
2297 // Resize the modified and used register unit trackers. We do this once
2298 // per function and then clear the register units each time we optimize a load
2299 // or store.
2300 ModifiedRegUnits.init(*TRI);
2301 UsedRegUnits.init(*TRI);
2302 DefinedInBB.init(*TRI);
2303
2304 bool Modified = false;
2305 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2306 for (auto &MBB : Fn) {
2307 auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2308 Modified |= M;
2309 }
2310
2311 return Modified;
2312 }
2313
2314 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2315 // stores near one another? Note: The pre-RA instruction scheduler already has
2316 // hooks to try and schedule pairable loads/stores together to improve pairing
2317 // opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
2318
2319 // FIXME: When pairing store instructions it's very possible for this pass to
2320 // hoist a store with a KILL marker above another use (without a KILL marker).
2321 // The resulting IR is invalid, but nothing uses the KILL markers after this
2322 // pass, so it's never caused a problem in practice.
2323
2324 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2325 /// load / store optimization pass.
2326 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2327 return new AArch64LoadStoreOpt();
2328 }
2329