1 //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of TargetFrameLowering class.
10 //
11 // On AArch64, stack frames are structured as follows:
12 //
13 // The stack grows downward.
14 //
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function such that a particular area isn't present
// in the frame.
18 //
19 // At function entry, the "frame" looks as follows:
20 //
21 // |                                   | Higher address
22 // |-----------------------------------|
23 // |                                   |
24 // | arguments passed on the stack     |
25 // |                                   |
26 // |-----------------------------------| <- sp
27 // |                                   | Lower address
28 //
29 //
30 // After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) isn't created until the main
// function body runs, after the prologue. However, it's depicted here
// for completeness.
35 //
36 // |                                   | Higher address
37 // |-----------------------------------|
38 // |                                   |
39 // | arguments passed on the stack     |
40 // |                                   |
41 // |-----------------------------------|
42 // |                                   |
43 // | (Win64 only) varargs from reg     |
44 // |                                   |
45 // |-----------------------------------|
46 // |                                   |
47 // | callee-saved gpr registers        | <--.
48 // |                                   |    | On Darwin platforms these
49 // |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
50 // | prev_lr                           |    | (frame record first)
51 // | prev_fp                           | <--'
52 // | async context if needed           |
53 // | (a.k.a. "frame record")           |
54 // |-----------------------------------| <- fp(=x29)
55 // |                                   |
56 // | callee-saved fp/simd/SVE regs     |
57 // |                                   |
58 // |-----------------------------------|
59 // |                                   |
60 // |        SVE stack objects          |
61 // |                                   |
62 // |-----------------------------------|
63 // |.empty.space.to.make.part.below....|
64 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
65 // |.the.standard.16-byte.alignment....|  compile time; if present)
66 // |-----------------------------------|
67 // |                                   |
68 // | local variables of fixed size     |
69 // | including spill slots             |
70 // |-----------------------------------| <- bp(not defined by ABI,
71 // |.variable-sized.local.variables....|       LLVM chooses X19)
72 // |.(VLAs)............................| (size of this area is unknown at
73 // |...................................|  compile time)
74 // |-----------------------------------| <- sp
75 // |                                   | Lower address
76 //
77 //
// To access data in a frame, a constant offset from one of the pointers
// (fp, bp, sp) must be computable at compile time. The sizes of the areas
// with a dotted background cannot be computed at compile time if they are
// present, so all three of fp, bp and sp must be set up in order to access
// all contents of the frame, assuming all of the frame areas are non-empty.
84 //
85 // For most functions, some of the frame areas are empty. For those functions,
86 // it may not be necessary to set up fp or bp:
87 // * A base pointer is definitely needed when there are both VLAs and local
88 //   variables with more-than-default alignment requirements.
89 // * A frame pointer is definitely needed when there are local variables with
90 //   more-than-default alignment requirements.
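//
// As an illustrative example (not compiler output), a function that combines
// an over-aligned local with a VLA ends up needing both:
//
//   void f(int n) {
//     _Alignas(32) char buf[64];  // over-aligned local -> frame pointer
//     char vla[n];                // variable-sized object -> base pointer
//     ...
//   }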
91 //
92 // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
93 // callee-saved area, since the unwind encoding does not allow for encoding
94 // this dynamically and existing tools depend on this layout. For other
95 // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
96 // area to allow SVE stack objects (allocated directly below the callee-saves,
97 // if available) to be accessed directly from the framepointer.
98 // The SVE spill/fill instructions have VL-scaled addressing modes such
99 // as:
100 //    ldr z8, [fp, #-7 mul vl]
101 // For SVE the size of the vector length (VL) is not known at compile-time, so
102 // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
103 // layout, we don't need to add an unscaled offset to the framepointer before
104 // accessing the SVE object in the frame.
105 //
106 // In some cases when a base pointer is not strictly needed, it is generated
107 // anyway when offsets from the frame pointer to access local variables become
108 // so large that the offset can't be encoded in the immediate fields of loads
109 // or stores.
110 //
111 // Outgoing function arguments must be at the bottom of the stack frame when
112 // calling another function. If we do not have variable-sized stack objects, we
113 // can allocate a "reserved call frame" area at the bottom of the local
114 // variable area, large enough for all outgoing calls. If we do have VLAs, then
115 // the stack pointer must be decremented and incremented around each call to
116 // make space for the arguments below the VLAs.
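//
// For illustration (a sketch, not verbatim compiler output), with VLAs
// present each call gets its own SP adjustment bracket:
//
//     sub  sp, sp, #16      ; make space for outgoing arguments
//     bl   callee
//     add  sp, sp, #16      ; release it again
//
// whereas with a reserved call frame that space is folded into the single
// SP decrement performed in the prologue.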
117 //
118 // FIXME: also explain the redzone concept.
119 //
120 // An example of the prologue:
121 //
122 //     .globl __foo
123 //     .align 2
124 //  __foo:
125 // Ltmp0:
126 //     .cfi_startproc
127 //     .cfi_personality 155, ___gxx_personality_v0
128 // Leh_func_begin:
129 //     .cfi_lsda 16, Lexception33
130 //
//     stp  xa, xb, [sp, #-offset]!
132 //     ...
133 //     stp  x28, x27, [sp, #offset-32]
134 //     stp  fp, lr, [sp, #offset-16]
135 //     add  fp, sp, #offset - 16
136 //     sub  sp, sp, #1360
137 //
138 // The Stack:
139 //       +-------------------------------------------+
140 // 10000 | ........ | ........ | ........ | ........ |
141 // 10004 | ........ | ........ | ........ | ........ |
142 //       +-------------------------------------------+
143 // 10008 | ........ | ........ | ........ | ........ |
144 // 1000c | ........ | ........ | ........ | ........ |
145 //       +===========================================+
146 // 10010 |                X28 Register               |
147 // 10014 |                X28 Register               |
148 //       +-------------------------------------------+
149 // 10018 |                X27 Register               |
150 // 1001c |                X27 Register               |
151 //       +===========================================+
152 // 10020 |                Frame Pointer              |
153 // 10024 |                Frame Pointer              |
154 //       +-------------------------------------------+
155 // 10028 |                Link Register              |
156 // 1002c |                Link Register              |
157 //       +===========================================+
158 // 10030 | ........ | ........ | ........ | ........ |
159 // 10034 | ........ | ........ | ........ | ........ |
160 //       +-------------------------------------------+
161 // 10038 | ........ | ........ | ........ | ........ |
162 // 1003c | ........ | ........ | ........ | ........ |
163 //       +-------------------------------------------+
164 //
165 //     [sp] = 10030        ::    >>initial value<<
166 //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
167 //     fp = sp == 10020    ::  mov fp, sp
168 //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
169 //     sp == 10010         ::    >>final value<<
170 //
171 // The frame pointer (w29) points to address 10020. If we use an offset of
172 // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
173 // for w27, and -32 for w28:
174 //
175 //  Ltmp1:
176 //     .cfi_def_cfa w29, 16
177 //  Ltmp2:
178 //     .cfi_offset w30, -8
179 //  Ltmp3:
180 //     .cfi_offset w29, -16
181 //  Ltmp4:
182 //     .cfi_offset w27, -24
183 //  Ltmp5:
184 //     .cfi_offset w28, -32
185 //
186 //===----------------------------------------------------------------------===//
187 
188 #include "AArch64FrameLowering.h"
189 #include "AArch64InstrInfo.h"
190 #include "AArch64MachineFunctionInfo.h"
191 #include "AArch64RegisterInfo.h"
192 #include "AArch64Subtarget.h"
193 #include "AArch64TargetMachine.h"
194 #include "MCTargetDesc/AArch64AddressingModes.h"
195 #include "MCTargetDesc/AArch64MCTargetDesc.h"
196 #include "llvm/ADT/ScopeExit.h"
197 #include "llvm/ADT/SmallVector.h"
198 #include "llvm/ADT/Statistic.h"
199 #include "llvm/CodeGen/LivePhysRegs.h"
200 #include "llvm/CodeGen/MachineBasicBlock.h"
201 #include "llvm/CodeGen/MachineFrameInfo.h"
202 #include "llvm/CodeGen/MachineFunction.h"
203 #include "llvm/CodeGen/MachineInstr.h"
204 #include "llvm/CodeGen/MachineInstrBuilder.h"
205 #include "llvm/CodeGen/MachineMemOperand.h"
206 #include "llvm/CodeGen/MachineModuleInfo.h"
207 #include "llvm/CodeGen/MachineOperand.h"
208 #include "llvm/CodeGen/MachineRegisterInfo.h"
209 #include "llvm/CodeGen/RegisterScavenging.h"
210 #include "llvm/CodeGen/TargetInstrInfo.h"
211 #include "llvm/CodeGen/TargetRegisterInfo.h"
212 #include "llvm/CodeGen/TargetSubtargetInfo.h"
213 #include "llvm/CodeGen/WinEHFuncInfo.h"
214 #include "llvm/IR/Attributes.h"
215 #include "llvm/IR/CallingConv.h"
216 #include "llvm/IR/DataLayout.h"
217 #include "llvm/IR/DebugLoc.h"
218 #include "llvm/IR/Function.h"
219 #include "llvm/MC/MCAsmInfo.h"
220 #include "llvm/MC/MCDwarf.h"
221 #include "llvm/Support/CommandLine.h"
222 #include "llvm/Support/Debug.h"
223 #include "llvm/Support/ErrorHandling.h"
224 #include "llvm/Support/MathExtras.h"
225 #include "llvm/Support/raw_ostream.h"
226 #include "llvm/Target/TargetMachine.h"
227 #include "llvm/Target/TargetOptions.h"
228 #include <cassert>
229 #include <cstdint>
230 #include <iterator>
231 #include <vector>
232 
233 using namespace llvm;
234 
235 #define DEBUG_TYPE "frame-info"
236 
237 static cl::opt<bool> EnableRedZone("aarch64-redzone",
238                                    cl::desc("enable use of redzone on AArch64"),
239                                    cl::init(false), cl::Hidden);
240 
241 static cl::opt<bool>
242     ReverseCSRRestoreSeq("reverse-csr-restore-seq",
243                          cl::desc("reverse the CSR restore sequence"),
244                          cl::init(false), cl::Hidden);
245 
246 static cl::opt<bool> StackTaggingMergeSetTag(
247     "stack-tagging-merge-settag",
248     cl::desc("merge settag instruction in function epilog"), cl::init(true),
249     cl::Hidden);
250 
251 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
252                                        cl::desc("sort stack allocations"),
253                                        cl::init(true), cl::Hidden);
254 
255 cl::opt<bool> EnableHomogeneousPrologEpilog(
256     "homogeneous-prolog-epilog", cl::Hidden,
257     cl::desc("Emit homogeneous prologue and epilogue for the size "
258              "optimization (default = off)"));
259 
260 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
261 
262 /// Returns how much of the incoming argument stack area (in bytes) we should
263 /// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
/// tail call to a function that uses less stack space for arguments) or
/// negative (for a tail call to a function that needs more stack space for
/// arguments than we do).
static int64_t getArgumentStackToRestore(MachineFunction &MF,
269                                          MachineBasicBlock &MBB) {
270   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
271   bool IsTailCallReturn = false;
272   if (MBB.end() != MBBI) {
273     unsigned RetOpcode = MBBI->getOpcode();
274     IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
275                        RetOpcode == AArch64::TCRETURNri ||
276                        RetOpcode == AArch64::TCRETURNriBTI;
277   }
278   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
279 
280   int64_t ArgumentPopSize = 0;
281   if (IsTailCallReturn) {
282     MachineOperand &StackAdjust = MBBI->getOperand(1);
283 
    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments; this is
    // calculated during LowerCall and consumed here...
287     ArgumentPopSize = StackAdjust.getImm();
288   } else {
289     // ... otherwise the amount to pop is *all* of the argument space,
290     // conveniently stored in the MachineFunctionInfo by
291     // LowerFormalArguments. This will, of course, be zero for the C calling
292     // convention.
293     ArgumentPopSize = AFI->getArgumentStackToRestore();
294   }
295 
296   return ArgumentPopSize;
297 }
298 
299 static bool produceCompactUnwindFrame(MachineFunction &MF);
300 static bool needsWinCFI(const MachineFunction &MF);
301 static StackOffset getSVEStackSize(const MachineFunction &MF);
302 static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF);
303 
/// Returns true if homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When an Exit block is given, this check is for the epilog.
bool AArch64FrameLowering::homogeneousPrologEpilog(
308     MachineFunction &MF, MachineBasicBlock *Exit) const {
309   if (!MF.getFunction().hasMinSize())
310     return false;
311   if (!EnableHomogeneousPrologEpilog)
312     return false;
313   if (ReverseCSRRestoreSeq)
314     return false;
315   if (EnableRedZone)
316     return false;
317 
  // TODO: Windows is not supported yet.
319   if (needsWinCFI(MF))
320     return false;
321   // TODO: SVE is not supported yet.
322   if (getSVEStackSize(MF))
323     return false;
324 
325   // Bail on stack adjustment needed on return for simplicity.
326   const MachineFrameInfo &MFI = MF.getFrameInfo();
327   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
328   if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
329     return false;
330   if (Exit && getArgumentStackToRestore(MF, *Exit))
331     return false;
332 
333   return true;
334 }
335 
336 /// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
338   return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
339 }
340 
/// This is the biggest offset to the stack pointer we can encode in AArch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
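/// As a concrete illustration of this limit, an unscaled access such as
///   ldur x0, [sp, #255]
/// can still encode its displacement directly, whereas larger offsets may
/// have to be materialized into a scratch register first.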
345 static const unsigned DefaultSafeSPDisplacement = 255;
346 
347 /// Look at each instruction that references stack frames and return the stack
348 /// size limit beyond which some of these instructions will require a scratch
349 /// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  // FIXME: For now, just conservatively guesstimate based on the unscaled
  // indexing range. We'll often end up allocating an unnecessary spill slot,
  // but realistically that's not a big deal at this stage of the game.
354   for (MachineBasicBlock &MBB : MF) {
355     for (MachineInstr &MI : MBB) {
356       if (MI.isDebugInstr() || MI.isPseudo() ||
357           MI.getOpcode() == AArch64::ADDXri ||
358           MI.getOpcode() == AArch64::ADDSXri)
359         continue;
360 
361       for (const MachineOperand &MO : MI.operands()) {
362         if (!MO.isFI())
363           continue;
364 
365         StackOffset Offset;
366         if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
367             AArch64FrameOffsetCannotUpdate)
368           return 0;
369       }
370     }
371   }
372   return DefaultSafeSPDisplacement;
373 }
374 
375 TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
377   return TargetStackID::ScalableVector;
378 }
379 
/// Returns the size of the fixed object area (allocated next to sp on entry).
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
static unsigned getFixedObjectSize(const MachineFunction &MF,
383                                    const AArch64FunctionInfo *AFI, bool IsWin64,
384                                    bool IsFunclet) {
385   if (!IsWin64 || IsFunclet) {
386     return AFI->getTailCallReservedStack();
387   } else {
388     if (AFI->getTailCallReservedStack() != 0)
389       report_fatal_error("cannot generate ABI-changing tail call for Win64");
390     // Var args are stored here in the primary function.
391     const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
392     // To support EH funclets we allocate an UnwindHelp object
393     const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
394     return alignTo(VarArgsArea + UnwindHelpObject, 16);
395   }
396 }
397 
/// Returns the size of the entire SVE stack frame (callee saves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
400   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
401   return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
402 }
403 
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
405   if (!EnableRedZone)
406     return false;
407 
408   // Don't use the red zone if the function explicitly asks us not to.
409   // This is typically used for kernel code.
410   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
411   const unsigned RedZoneSize =
412       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
413   if (!RedZoneSize)
414     return false;
415 
416   const MachineFrameInfo &MFI = MF.getFrameInfo();
417   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
418   uint64_t NumBytes = AFI->getLocalStackSize();
419 
420   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
421            getSVEStackSize(MF));
422 }
423 
424 /// hasFP - Return true if the specified function should have a dedicated frame
425 /// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
427   const MachineFrameInfo &MFI = MF.getFrameInfo();
428   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
429   // Win64 EH requires a frame pointer if funclets are present, as the locals
430   // are accessed off the frame pointer in both the parent function and the
431   // funclets.
432   if (MF.hasEHFunclets())
433     return true;
434   // Retain behavior of always omitting the FP for leaf functions when possible.
435   if (MF.getTarget().Options.DisableFramePointerElim(MF))
436     return true;
437   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
438       MFI.hasStackMap() || MFI.hasPatchPoint() ||
439       RegInfo->hasStackRealignment(MF))
440     return true;
  // With large call frames around we may need to use FP to access the
  // scavenging emergency spill slot.
443   //
444   // Unfortunately some calls to hasFP() like machine verifier ->
445   // getReservedReg() -> hasFP in the middle of global isel are too early
446   // to know the max call frame size. Hopefully conservatively returning "true"
447   // in those cases is fine.
448   // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
449   if (!MFI.isMaxCallFrameSizeComputed() ||
450       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
451     return true;
452 
453   return false;
454 }
455 
456 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
457 /// not required, we reserve argument space for call sites in the function
458 /// immediately on entry to the current function.  This eliminates the need for
459 /// add/sub sp brackets around call sites.  Returns true if the call frame is
460 /// included as part of the stack frame.
461 bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
463   return !MF.getFrameInfo().hasVarSizedObjects();
464 }
465 
MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
467     MachineFunction &MF, MachineBasicBlock &MBB,
468     MachineBasicBlock::iterator I) const {
469   const AArch64InstrInfo *TII =
470       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
471   DebugLoc DL = I->getDebugLoc();
472   unsigned Opc = I->getOpcode();
473   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
474   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
475 
476   if (!hasReservedCallFrame(MF)) {
477     int64_t Amount = I->getOperand(0).getImm();
478     Amount = alignTo(Amount, getStackAlign());
479     if (!IsDestroy)
480       Amount = -Amount;
481 
482     // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
483     // doesn't have to pop anything), then the first operand will be zero too so
484     // this adjustment is a no-op.
485     if (CalleePopAmount == 0) {
486       // FIXME: in-function stack adjustment for calls is limited to 24-bits
487       // because there's no guaranteed temporary register available.
488       //
489       // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
490       // 1) For offset <= 12-bit, we use LSL #0
491       // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
492       // LSL #0, and the other uses LSL #12.
493       //
494       // Most call frames will be allocated at the start of a function so
495       // this is OK, but it is a limitation that needs dealing with.
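      // For example (illustrative only), a 24-bit adjustment of 0x45678 bytes
      // would be emitted as two instructions:
      //   sub sp, sp, #0x45, lsl #12   // 0x45000
      //   sub sp, sp, #0x678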
496       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
497       emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
498                       StackOffset::getFixed(Amount), TII);
499     }
500   } else if (CalleePopAmount != 0) {
501     // If the calling convention demands that the callee pops arguments from the
502     // stack, we want to add it back if we have a reserved call frame.
503     assert(CalleePopAmount < 0xffffff && "call frame too large");
504     emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
505                     StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
506   }
507   return MBB.erase(I);
508 }
509 
void AArch64FrameLowering::emitCalleeSavedGPRLocations(
511     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
512   MachineFunction &MF = *MBB.getParent();
513   MachineFrameInfo &MFI = MF.getFrameInfo();
514 
515   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
516   if (CSI.empty())
517     return;
518 
519   const TargetSubtargetInfo &STI = MF.getSubtarget();
520   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
521   const TargetInstrInfo &TII = *STI.getInstrInfo();
522   DebugLoc DL = MBB.findDebugLoc(MBBI);
523 
524   for (const auto &Info : CSI) {
525     if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
526       continue;
527 
528     assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
529     unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
530 
531     int64_t Offset =
532         MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
533     unsigned CFIIndex = MF.addFrameInst(
534         MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
535     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
536         .addCFIIndex(CFIIndex)
537         .setMIFlags(MachineInstr::FrameSetup);
538   }
539 }
540 
void AArch64FrameLowering::emitCalleeSavedSVELocations(
542     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
543   MachineFunction &MF = *MBB.getParent();
544   MachineFrameInfo &MFI = MF.getFrameInfo();
545 
546   // Add callee saved registers to move list.
547   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
548   if (CSI.empty())
549     return;
550 
551   const TargetSubtargetInfo &STI = MF.getSubtarget();
552   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
553   const TargetInstrInfo &TII = *STI.getInstrInfo();
554   DebugLoc DL = MBB.findDebugLoc(MBBI);
555   AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
556 
557   for (const auto &Info : CSI) {
558     if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
559       continue;
560 
    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
563     assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
564     unsigned Reg = Info.getReg();
565     if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
566       continue;
567 
568     StackOffset Offset =
569         StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
570         StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
571 
572     unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset));
573     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
574         .addCFIIndex(CFIIndex)
575         .setMIFlags(MachineInstr::FrameSetup);
576   }
577 }
578 
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
580     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
581   emitCalleeSavedGPRLocations(MBB, MBBI);
582   emitCalleeSavedSVELocations(MBB, MBBI);
583 }
584 
static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF,
586                                MachineBasicBlock &MBB,
587                                MachineBasicBlock::iterator InsertPt,
588                                unsigned DwarfReg) {
589   unsigned CFIIndex =
590       MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg));
591   BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex);
592 }
593 
void AArch64FrameLowering::resetCFIToInitialState(
595     MachineBasicBlock &MBB) const {
596 
597   MachineFunction &MF = *MBB.getParent();
598   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
599   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
600   const auto &TRI =
601       static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
602   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
603 
604   const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION);
605   DebugLoc DL;
606 
607   // Reset the CFA to `SP + 0`.
608   MachineBasicBlock::iterator InsertPt = MBB.begin();
609   unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
610       nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0));
611   BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
612 
613   // Flip the RA sign state.
614   if (MFI.shouldSignReturnAddress()) {
615     CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
616     BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
617   }
618 
619   // Shadow call stack uses X18, reset it.
620   if (needsShadowCallStackPrologueEpilogue(MF))
621     insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
622                        TRI.getDwarfRegNum(AArch64::X18, true));
623 
624   // Emit .cfi_same_value for callee-saved registers.
625   const std::vector<CalleeSavedInfo> &CSI =
626       MF.getFrameInfo().getCalleeSavedInfo();
627   for (const auto &Info : CSI) {
628     unsigned Reg = Info.getReg();
629     if (!TRI.regNeedsCFI(Reg, Reg))
630       continue;
631     insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
632                        TRI.getDwarfRegNum(Reg, true));
633   }
634 }
635 
static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
637                                     MachineBasicBlock::iterator MBBI,
638                                     bool SVE) {
639   MachineFunction &MF = *MBB.getParent();
640   MachineFrameInfo &MFI = MF.getFrameInfo();
641 
642   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
643   if (CSI.empty())
644     return;
645 
646   const TargetSubtargetInfo &STI = MF.getSubtarget();
647   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
648   const TargetInstrInfo &TII = *STI.getInstrInfo();
649   DebugLoc DL = MBB.findDebugLoc(MBBI);
650 
651   for (const auto &Info : CSI) {
652     if (SVE !=
653         (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
654       continue;
655 
656     unsigned Reg = Info.getReg();
657     if (SVE &&
658         !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
659       continue;
660 
661     unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
662         nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
663     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
664         .addCFIIndex(CFIIndex)
665         .setMIFlags(MachineInstr::FrameDestroy);
666   }
667 }
668 
void AArch64FrameLowering::emitCalleeSavedGPRRestores(
670     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
671   emitCalleeSavedRestores(MBB, MBBI, false);
672 }
673 
void AArch64FrameLowering::emitCalleeSavedSVERestores(
675     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
676   emitCalleeSavedRestores(MBB, MBBI, true);
677 }
678 
static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
680   switch (Reg.id()) {
681   default:
682     // The called routine is expected to preserve r19-r28
683     // r29 and r30 are used as frame pointer and link register resp.
684     return 0;
685 
686     // GPRs
687 #define CASE(n)                                                                \
688   case AArch64::W##n:                                                          \
689   case AArch64::X##n:                                                          \
690     return AArch64::X##n
691   CASE(0);
692   CASE(1);
693   CASE(2);
694   CASE(3);
695   CASE(4);
696   CASE(5);
697   CASE(6);
698   CASE(7);
699   CASE(8);
700   CASE(9);
701   CASE(10);
702   CASE(11);
703   CASE(12);
704   CASE(13);
705   CASE(14);
706   CASE(15);
707   CASE(16);
708   CASE(17);
709   CASE(18);
710 #undef CASE
711 
712     // FPRs
713 #define CASE(n)                                                                \
714   case AArch64::B##n:                                                          \
715   case AArch64::H##n:                                                          \
716   case AArch64::S##n:                                                          \
717   case AArch64::D##n:                                                          \
718   case AArch64::Q##n:                                                          \
719     return HasSVE ? AArch64::Z##n : AArch64::Q##n
720   CASE(0);
721   CASE(1);
722   CASE(2);
723   CASE(3);
724   CASE(4);
725   CASE(5);
726   CASE(6);
727   CASE(7);
728   CASE(8);
729   CASE(9);
730   CASE(10);
731   CASE(11);
732   CASE(12);
733   CASE(13);
734   CASE(14);
735   CASE(15);
736   CASE(16);
737   CASE(17);
738   CASE(18);
739   CASE(19);
740   CASE(20);
741   CASE(21);
742   CASE(22);
743   CASE(23);
744   CASE(24);
745   CASE(25);
746   CASE(26);
747   CASE(27);
748   CASE(28);
749   CASE(29);
750   CASE(30);
751   CASE(31);
752 #undef CASE
753   }
754 }
755 
void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
757                                                 MachineBasicBlock &MBB) const {
758   // Insertion point.
759   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
760 
761   // Fake a debug loc.
762   DebugLoc DL;
763   if (MBBI != MBB.end())
764     DL = MBBI->getDebugLoc();
765 
766   const MachineFunction &MF = *MBB.getParent();
767   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
768   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
769 
770   BitVector GPRsToZero(TRI.getNumRegs());
771   BitVector FPRsToZero(TRI.getNumRegs());
772   bool HasSVE = STI.hasSVE();
773   for (MCRegister Reg : RegsToZero.set_bits()) {
774     if (TRI.isGeneralPurposeRegister(MF, Reg)) {
775       // For GPRs, we only care to clear out the 64-bit register.
776       if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
777         GPRsToZero.set(XReg);
778     } else if (AArch64::FPR128RegClass.contains(Reg) ||
779                AArch64::FPR64RegClass.contains(Reg) ||
780                AArch64::FPR32RegClass.contains(Reg) ||
781                AArch64::FPR16RegClass.contains(Reg) ||
782                AArch64::FPR8RegClass.contains(Reg)) {
      // For FPRs, zero the widest form (Q, or Z when SVE is available).
784       if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
785         FPRsToZero.set(XReg);
786     }
787   }
788 
789   const AArch64InstrInfo &TII = *STI.getInstrInfo();
790 
791   // Zero out GPRs.
792   for (MCRegister Reg : GPRsToZero.set_bits())
793     BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), Reg).addImm(0);
794 
795   // Zero out FP/vector registers.
796   for (MCRegister Reg : FPRsToZero.set_bits())
797     if (HasSVE)
798       BuildMI(MBB, MBBI, DL, TII.get(AArch64::DUP_ZI_D), Reg)
799         .addImm(0)
800         .addImm(0);
801     else
802       BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVIv2d_ns), Reg).addImm(0);
803 
804   if (HasSVE) {
805     for (MCRegister PReg :
806          {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
807           AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
808           AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
809           AArch64::P15}) {
810       if (RegsToZero[PReg])
811         BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
812     }
813   }
814 }
815 
816 // Find a scratch register that we can use at the start of the prologue to
817 // re-align the stack pointer.  We avoid using callee-save registers since they
818 // may appear to be free when this is called from canUseAsPrologue (during
819 // shrink wrapping), but then no longer be free when this is called from
820 // emitPrologue.
821 //
822 // FIXME: This is a bit conservative, since in the above case we could use one
823 // of the callee-save registers as a scratch temp to re-align the stack pointer,
824 // but we would then have to make sure that we were in fact saving at least one
825 // callee-save register in the prologue, which is additional complexity that
826 // doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
828   MachineFunction *MF = MBB->getParent();
829 
830   // If MBB is an entry block, use X9 as the scratch register
831   if (&MF->front() == MBB)
832     return AArch64::X9;
833 
834   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
835   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
836   LivePhysRegs LiveRegs(TRI);
837   LiveRegs.addLiveIns(*MBB);
838 
839   // Mark callee saved registers as used so we will not choose them.
840   const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
841   for (unsigned i = 0; CSRegs[i]; ++i)
842     LiveRegs.addReg(CSRegs[i]);
843 
844   // Prefer X9 since it was historically used for the prologue scratch reg.
845   const MachineRegisterInfo &MRI = MF->getRegInfo();
846   if (LiveRegs.available(MRI, AArch64::X9))
847     return AArch64::X9;
848 
849   for (unsigned Reg : AArch64::GPR64RegClass) {
850     if (LiveRegs.available(MRI, Reg))
851       return Reg;
852   }
853   return AArch64::NoRegister;
854 }
855 
bool AArch64FrameLowering::canUseAsPrologue(
857     const MachineBasicBlock &MBB) const {
858   const MachineFunction *MF = MBB.getParent();
859   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
860   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
861   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
862 
863   // Don't need a scratch register if we're not going to re-align the stack.
864   if (!RegInfo->hasStackRealignment(*MF))
865     return true;
866   // Otherwise, we can use any block as long as it has a scratch register
867   // available.
868   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
869 }
870 
static bool windowsRequiresStackProbe(MachineFunction &MF,
872                                       uint64_t StackSizeInBytes) {
873   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
874   if (!Subtarget.isTargetWindows())
875     return false;
876   const Function &F = MF.getFunction();
877   // TODO: When implementing stack protectors, take that into account
878   // for the probe threshold.
879   unsigned StackProbeSize = 4096;
880   if (F.hasFnAttribute("stack-probe-size"))
881     F.getFnAttribute("stack-probe-size")
882         .getValueAsString()
883         .getAsInteger(0, StackProbeSize);
884   return (StackSizeInBytes >= StackProbeSize) &&
885          !F.hasFnAttribute("no-stack-arg-probe");
886 }
887 
static bool needsWinCFI(const MachineFunction &MF) {
889   const Function &F = MF.getFunction();
890   return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
891          F.needsUnwindTableEntry();
892 }
893 
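// Returns true when the callee-save spills and the local-area allocation can
// be folded into a single SP decrement. As an illustrative sketch, instead of
//   stp x29, x30, [sp, #-16]!
//   sub sp, sp, #32
// the prologue then emits
//   sub sp, sp, #48
//   stp x29, x30, [sp, #32]
// with the callee-save offsets fixed up accordingly.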
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
895     MachineFunction &MF, uint64_t StackBumpBytes) const {
896   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
897   const MachineFrameInfo &MFI = MF.getFrameInfo();
898   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
899   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
900   if (homogeneousPrologEpilog(MF))
901     return false;
902 
903   if (AFI->getLocalStackSize() == 0)
904     return false;
905 
906   // For WinCFI, if optimizing for size, prefer to not combine the stack bump
907   // (to force a stp with predecrement) to match the packed unwind format,
908   // provided that there actually are any callee saved registers to merge the
909   // decrement with.
910   // This is potentially marginally slower, but allows using the packed
911   // unwind format for functions that both have a local area and callee saved
912   // registers. Using the packed unwind format notably reduces the size of
913   // the unwind info.
914   if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
915       MF.getFunction().hasOptSize())
916     return false;
917 
918   // 512 is the maximum immediate for stp/ldp that will be used for
919   // callee-save save/restores
920   if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
921     return false;
922 
923   if (MFI.hasVarSizedObjects())
924     return false;
925 
926   if (RegInfo->hasStackRealignment(MF))
927     return false;
928 
929   // This isn't strictly necessary, but it simplifies things a bit since the
930   // current RedZone handling code assumes the SP is adjusted by the
931   // callee-save save/restore code.
932   if (canUseRedZone(MF))
933     return false;
934 
935   // When there is an SVE area on the stack, always allocate the
936   // callee-saves and spills/locals separately.
937   if (getSVEStackSize(MF))
938     return false;
939 
940   return true;
941 }
942 
bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
944     MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
945   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
946     return false;
947 
948   if (MBB.empty())
949     return true;
950 
951   // Disable combined SP bump if the last instruction is an MTE tag store. It
952   // is almost always better to merge SP adjustment into those instructions.
953   MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
954   MachineBasicBlock::iterator Begin = MBB.begin();
955   while (LastI != Begin) {
956     --LastI;
957     if (LastI->isTransient())
958       continue;
959     if (!LastI->getFlag(MachineInstr::FrameDestroy))
960       break;
961   }
962   switch (LastI->getOpcode()) {
963   case AArch64::STGloop:
964   case AArch64::STZGloop:
965   case AArch64::STGOffset:
966   case AArch64::STZGOffset:
967   case AArch64::ST2GOffset:
968   case AArch64::STZ2GOffset:
969     return false;
970   default:
971     return true;
972   }
973   llvm_unreachable("unreachable");
974 }
975 
// Given a load or a store instruction, generate the appropriate SEH unwind
// code for it on Windows.
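// For example (illustrative), a prologue store such as
//   stp x19, x20, [sp, #-32]!
// is followed by an inserted SEH_SaveRegP_X pseudo, which is later printed as
// a .seh_save_regp_x directive for the Windows unwinder.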
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
979                                              const TargetInstrInfo &TII,
980                                              MachineInstr::MIFlag Flag) {
981   unsigned Opc = MBBI->getOpcode();
982   MachineBasicBlock *MBB = MBBI->getParent();
983   MachineFunction &MF = *MBB->getParent();
984   DebugLoc DL = MBBI->getDebugLoc();
985   unsigned ImmIdx = MBBI->getNumOperands() - 1;
986   int Imm = MBBI->getOperand(ImmIdx).getImm();
987   MachineInstrBuilder MIB;
988   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
989   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
990 
991   switch (Opc) {
992   default:
993     llvm_unreachable("No SEH Opcode for this instruction");
994   case AArch64::LDPDpost:
995     Imm = -Imm;
996     LLVM_FALLTHROUGH;
997   case AArch64::STPDpre: {
998     unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
999     unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
1000     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
1001               .addImm(Reg0)
1002               .addImm(Reg1)
1003               .addImm(Imm * 8)
1004               .setMIFlag(Flag);
1005     break;
1006   }
1007   case AArch64::LDPXpost:
1008     Imm = -Imm;
1009     LLVM_FALLTHROUGH;
1010   case AArch64::STPXpre: {
1011     Register Reg0 = MBBI->getOperand(1).getReg();
1012     Register Reg1 = MBBI->getOperand(2).getReg();
1013     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1014       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
1015                 .addImm(Imm * 8)
1016                 .setMIFlag(Flag);
1017     else
1018       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
1019                 .addImm(RegInfo->getSEHRegNum(Reg0))
1020                 .addImm(RegInfo->getSEHRegNum(Reg1))
1021                 .addImm(Imm * 8)
1022                 .setMIFlag(Flag);
1023     break;
1024   }
1025   case AArch64::LDRDpost:
1026     Imm = -Imm;
1027     LLVM_FALLTHROUGH;
1028   case AArch64::STRDpre: {
1029     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1030     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
1031               .addImm(Reg)
1032               .addImm(Imm)
1033               .setMIFlag(Flag);
1034     break;
1035   }
1036   case AArch64::LDRXpost:
1037     Imm = -Imm;
1038     LLVM_FALLTHROUGH;
1039   case AArch64::STRXpre: {
1040     unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1041     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
1042               .addImm(Reg)
1043               .addImm(Imm)
1044               .setMIFlag(Flag);
1045     break;
1046   }
1047   case AArch64::STPDi:
1048   case AArch64::LDPDi: {
1049     unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1050     unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1051     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
1052               .addImm(Reg0)
1053               .addImm(Reg1)
1054               .addImm(Imm * 8)
1055               .setMIFlag(Flag);
1056     break;
1057   }
1058   case AArch64::STPXi:
1059   case AArch64::LDPXi: {
1060     Register Reg0 = MBBI->getOperand(0).getReg();
1061     Register Reg1 = MBBI->getOperand(1).getReg();
1062     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1063       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
1064                 .addImm(Imm * 8)
1065                 .setMIFlag(Flag);
1066     else
1067       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
1068                 .addImm(RegInfo->getSEHRegNum(Reg0))
1069                 .addImm(RegInfo->getSEHRegNum(Reg1))
1070                 .addImm(Imm * 8)
1071                 .setMIFlag(Flag);
1072     break;
1073   }
1074   case AArch64::STRXui:
1075   case AArch64::LDRXui: {
1076     int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1077     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
1078               .addImm(Reg)
1079               .addImm(Imm * 8)
1080               .setMIFlag(Flag);
1081     break;
1082   }
1083   case AArch64::STRDui:
1084   case AArch64::LDRDui: {
1085     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1086     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
1087               .addImm(Reg)
1088               .addImm(Imm * 8)
1089               .setMIFlag(Flag);
1090     break;
1091   }
1092   }
1093   auto I = MBB->insertAfter(MBBI, MIB);
1094   return I;
1095 }
1096 
1097 // Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
1099                            unsigned LocalStackSize) {
1100   MachineOperand *ImmOpnd = nullptr;
1101   unsigned ImmIdx = MBBI->getNumOperands() - 1;
1102   switch (MBBI->getOpcode()) {
1103   default:
1104     llvm_unreachable("Fix the offset in the SEH instruction");
1105   case AArch64::SEH_SaveFPLR:
1106   case AArch64::SEH_SaveRegP:
1107   case AArch64::SEH_SaveReg:
1108   case AArch64::SEH_SaveFRegP:
1109   case AArch64::SEH_SaveFReg:
1110     ImmOpnd = &MBBI->getOperand(ImmIdx);
1111     break;
1112   }
1113   if (ImmOpnd)
1114     ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
1115 }
1116 
// Convert a callee-save register save/restore instruction into one that also
// decrements/increments the stack pointer, so that the store/load itself
// allocates/deallocates the callee-save stack area, by switching it to the
// pre/post-increment form.
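//
// For example (a sketch of the prologue case with a 16-byte callee-save
// area), the first callee-save store
//   stp x29, x30, [sp]
// becomes
//   stp x29, x30, [sp, #-16]!
// so the same instruction also allocates the callee-save area.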
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
1121     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
1122     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
1123     bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
1124     MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
1125     int CFAOffset = 0) {
1126   unsigned NewOpc;
1127   switch (MBBI->getOpcode()) {
1128   default:
1129     llvm_unreachable("Unexpected callee-save save/restore opcode!");
1130   case AArch64::STPXi:
1131     NewOpc = AArch64::STPXpre;
1132     break;
1133   case AArch64::STPDi:
1134     NewOpc = AArch64::STPDpre;
1135     break;
1136   case AArch64::STPQi:
1137     NewOpc = AArch64::STPQpre;
1138     break;
1139   case AArch64::STRXui:
1140     NewOpc = AArch64::STRXpre;
1141     break;
1142   case AArch64::STRDui:
1143     NewOpc = AArch64::STRDpre;
1144     break;
1145   case AArch64::STRQui:
1146     NewOpc = AArch64::STRQpre;
1147     break;
1148   case AArch64::LDPXi:
1149     NewOpc = AArch64::LDPXpost;
1150     break;
1151   case AArch64::LDPDi:
1152     NewOpc = AArch64::LDPDpost;
1153     break;
1154   case AArch64::LDPQi:
1155     NewOpc = AArch64::LDPQpost;
1156     break;
1157   case AArch64::LDRXui:
1158     NewOpc = AArch64::LDRXpost;
1159     break;
1160   case AArch64::LDRDui:
1161     NewOpc = AArch64::LDRDpost;
1162     break;
1163   case AArch64::LDRQui:
1164     NewOpc = AArch64::LDRQpost;
1165     break;
1166   }
1167   // Get rid of the SEH code associated with the old instruction.
1168   if (NeedsWinCFI) {
1169     auto SEH = std::next(MBBI);
1170     if (AArch64InstrInfo::isSEHInstruction(*SEH))
1171       SEH->eraseFromParent();
1172   }
1173 
1174   TypeSize Scale = TypeSize::Fixed(1);
1175   unsigned Width;
1176   int64_t MinOffset, MaxOffset;
1177   bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
1178       NewOpc, Scale, Width, MinOffset, MaxOffset);
1179   (void)Success;
1180   assert(Success && "unknown load/store opcode");
1181 
  // If the first store isn't right where we want SP then we can't fold the
  // update in, so create a normal arithmetic instruction instead.
1184   MachineFunction &MF = *MBB.getParent();
1185   if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
1186       CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
1187     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1188                     StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
1189                     false, false, nullptr, EmitCFI,
1190                     StackOffset::getFixed(CFAOffset));
1191 
1192     return std::prev(MBBI);
1193   }
1194 
1195   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
1196   MIB.addReg(AArch64::SP, RegState::Define);
1197 
1198   // Copy all operands other than the immediate offset.
1199   unsigned OpndIdx = 0;
1200   for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
1201        ++OpndIdx)
1202     MIB.add(MBBI->getOperand(OpndIdx));
1203 
1204   assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
1205          "Unexpected immediate offset in first/last callee-save save/restore "
1206          "instruction!");
1207   assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
1208          "Unexpected base register in callee-save save/restore instruction!");
1209   assert(CSStackSizeInc % Scale == 0);
1210   MIB.addImm(CSStackSizeInc / (int)Scale);
1211 
1212   MIB.setMIFlags(MBBI->getFlags());
1213   MIB.setMemRefs(MBBI->memoperands());
1214 
1215   // Generate a new SEH code that corresponds to the new instruction.
1216   if (NeedsWinCFI) {
1217     *HasWinCFI = true;
1218     InsertSEH(*MIB, *TII, FrameFlag);
1219   }
1220 
1221   if (EmitCFI) {
1222     unsigned CFIIndex = MF.addFrameInst(
1223         MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc));
1224     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1225         .addCFIIndex(CFIIndex)
1226         .setMIFlags(FrameFlag);
1227   }
1228 
1229   return std::prev(MBB.erase(MBBI));
1230 }
1231 
1232 // Fixup callee-save register save/restore instructions to take into account
1233 // combined SP bump by adding the local stack size to the stack offsets.
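//
// For example (illustrative), with a 32-byte local area a callee-save store
//   stp x19, x20, [sp, #16]
// is rewritten to
//   stp x19, x20, [sp, #48]
// because the single combined SP decrement also covers the local area, which
// now sits between SP and the callee-saves.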
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
1235                                               uint64_t LocalStackSize,
1236                                               bool NeedsWinCFI,
1237                                               bool *HasWinCFI) {
1238   if (AArch64InstrInfo::isSEHInstruction(MI))
1239     return;
1240 
1241   unsigned Opc = MI.getOpcode();
1242   unsigned Scale;
1243   switch (Opc) {
1244   case AArch64::STPXi:
1245   case AArch64::STRXui:
1246   case AArch64::STPDi:
1247   case AArch64::STRDui:
1248   case AArch64::LDPXi:
1249   case AArch64::LDRXui:
1250   case AArch64::LDPDi:
1251   case AArch64::LDRDui:
1252     Scale = 8;
1253     break;
1254   case AArch64::STPQi:
1255   case AArch64::STRQui:
1256   case AArch64::LDPQi:
1257   case AArch64::LDRQui:
1258     Scale = 16;
1259     break;
1260   default:
1261     llvm_unreachable("Unexpected callee-save save/restore opcode!");
1262   }
1263 
1264   unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1265   assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1266          "Unexpected base register in callee-save save/restore instruction!");
1267   // Last operand is immediate offset that needs fixing.
1268   MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1269   // All generated opcodes have scaled offsets.
1270   assert(LocalStackSize % Scale == 0);
1271   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1272 
1273   if (NeedsWinCFI) {
1274     *HasWinCFI = true;
1275     auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1276     assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1277     assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1278            "Expecting a SEH instruction");
1279     fixupSEHOpcode(MBBI, LocalStackSize);
1280   }
1281 }
1282 
static bool isTargetWindows(const MachineFunction &MF) {
1284   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1285 }
1286 
1287 // Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1289   switch (I->getOpcode()) {
1290   default:
1291     return false;
1292   case AArch64::STR_ZXI:
1293   case AArch64::STR_PXI:
1294   case AArch64::LDR_ZXI:
1295   case AArch64::LDR_PXI:
1296     return I->getFlag(MachineInstr::FrameSetup) ||
1297            I->getFlag(MachineInstr::FrameDestroy);
1298   }
1299 }
1300 
static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) {
1302   if (!(llvm::any_of(
1303             MF.getFrameInfo().getCalleeSavedInfo(),
1304             [](const auto &Info) { return Info.getReg() == AArch64::LR; }) &&
1305         MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
1306     return false;
1307 
1308   if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
1309     report_fatal_error("Must reserve x18 to use shadow call stack");
1310 
1311   return true;
1312 }
1313 
1314 static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
1315                                         MachineFunction &MF,
1316                                         MachineBasicBlock &MBB,
1317                                         MachineBasicBlock::iterator MBBI,
1318                                         const DebugLoc &DL, bool NeedsWinCFI,
1319                                         bool NeedsUnwindInfo) {
1320   // Shadow call stack prolog: str x30, [x18], #8
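       // The post-indexed store pushes LR onto the shadow call stack pointed to
       // by x18 and then advances x18 by 8 (operands: x18 writeback def, stored
       // value, base register, post-increment).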
1321   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
1322       .addReg(AArch64::X18, RegState::Define)
1323       .addReg(AArch64::LR)
1324       .addReg(AArch64::X18)
1325       .addImm(8)
1326       .setMIFlag(MachineInstr::FrameSetup);
1327 
1328   // This instruction also makes x18 live-in to the entry block.
1329   MBB.addLiveIn(AArch64::X18);
1330 
1331   if (NeedsWinCFI)
1332     BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
1333         .setMIFlag(MachineInstr::FrameSetup);
1334 
1335   if (NeedsUnwindInfo) {
1336     // Emit a CFI instruction that causes 8 to be subtracted from the value of
1337     // x18 when unwinding past this frame.
1338     static const char CFIInst[] = {
1339         dwarf::DW_CFA_val_expression,
1340         18, // register
1341         2,  // length
1342         static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
1343         static_cast<char>(-8) & 0x7f, // addend (sleb128)
1344     };
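         // This escape is expected to encode to the bytes 0x16 (DW_CFA_val_expression),
         // 0x12 (reg 18), 0x02 (block length), 0x82 (DW_OP_breg18), 0x78 (SLEB128 -8),
         // i.e. when unwinding, the caller's x18 is recomputed as x18 - 8.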
1345     unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
1346         nullptr, StringRef(CFIInst, sizeof(CFIInst))));
1347     BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
1348         .addCFIIndex(CFIIndex)
1349         .setMIFlag(MachineInstr::FrameSetup);
1350   }
1351 }
1352 
1353 static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
1354                                         MachineFunction &MF,
1355                                         MachineBasicBlock &MBB,
1356                                         MachineBasicBlock::iterator MBBI,
1357                                         const DebugLoc &DL) {
1358   // Shadow call stack epilog: ldr x30, [x18, #-8]!
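       // The pre-indexed load decrements x18 by 8 and reloads LR from the shadow
       // call stack, popping the entry pushed by the prologue.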
1359   BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
1360       .addReg(AArch64::X18, RegState::Define)
1361       .addReg(AArch64::LR, RegState::Define)
1362       .addReg(AArch64::X18)
1363       .addImm(-8)
1364       .setMIFlag(MachineInstr::FrameDestroy);
1365 
1366   if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) {
1367     unsigned CFIIndex =
1368         MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
1369     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
1370         .addCFIIndex(CFIIndex)
1371         .setMIFlags(MachineInstr::FrameDestroy);
1372   }
1373 }
1374 
1375 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1376                                         MachineBasicBlock &MBB) const {
1377   MachineBasicBlock::iterator MBBI = MBB.begin();
1378   const MachineFrameInfo &MFI = MF.getFrameInfo();
1379   const Function &F = MF.getFunction();
1380   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1381   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1382   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1383   MachineModuleInfo &MMI = MF.getMMI();
1384   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1385   bool EmitCFI = AFI->needsDwarfUnwindInfo();
1386   bool HasFP = hasFP(MF);
1387   bool NeedsWinCFI = needsWinCFI(MF);
1388   bool HasWinCFI = false;
1389   auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1390 
1391   bool IsFunclet = MBB.isEHFuncletEntry();
1392 
1393   // At this point, we're going to decide whether or not the function uses a
1394   // redzone. In most cases, the function doesn't have a redzone so let's
1395   // assume that's false and set it to true in the case that there's a redzone.
1396   AFI->setHasRedZone(false);
1397 
1398   // Debug location must be unknown since the first debug location is used
1399   // to determine the end of the prologue.
1400   DebugLoc DL;
1401 
1402   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1403   if (needsShadowCallStackPrologueEpilogue(MF))
1404     emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
1405                                 MFnI.needsDwarfUnwindInfo());
1406 
1407   if (MFnI.shouldSignReturnAddress()) {
1408     unsigned PACI;
1409     if (MFnI.shouldSignWithBKey()) {
1410       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1411           .setMIFlag(MachineInstr::FrameSetup);
1412       PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
1413     } else {
1414       PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
1415     }
1416 
1417     auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
1418     if (Subtarget.hasPAuth())
1419       MI.addReg(AArch64::LR, RegState::Define)
1420           .addReg(AArch64::LR)
1421           .addReg(AArch64::SP, RegState::InternalRead);
1422     MI.setMIFlag(MachineInstr::FrameSetup);
1423     if (EmitCFI) {
1424       unsigned CFIIndex =
1425           MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1426       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1427           .addCFIIndex(CFIIndex)
1428           .setMIFlags(MachineInstr::FrameSetup);
1429     }
1430   }
1431   if (EmitCFI && MFnI.isMTETagged()) {
1432     BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
1433         .setMIFlag(MachineInstr::FrameSetup);
1434   }
1435 
1436   // We signal the presence of a Swift extended frame to external tools by
1437   // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1438   // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
1439   // bits so that this still holds.
1440   if (HasFP && AFI->hasSwiftAsyncContext()) {
1441     switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1442     case SwiftAsyncFramePointerMode::DeploymentBased:
1443       if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
1444         // The special symbol below is absolute and has a *value* that can be
1445         // combined with the frame pointer to signal an extended frame.
1446         BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
1447             .addExternalSymbol("swift_async_extendedFramePointerFlags",
1448                                AArch64II::MO_GOT);
1449         BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
1450             .addUse(AArch64::FP)
1451             .addUse(AArch64::X16)
1452             .addImm(Subtarget.isTargetILP32() ? 32 : 0);
1453         break;
1454       }
1455       LLVM_FALLTHROUGH;
1456 
1457     case SwiftAsyncFramePointerMode::Always:
1458       // ORR x29, x29, #0x1000_0000_0000_0000
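           // 0x1100 is the encoded logical immediate (N=1, immr=4, imms=0), which
           // decodes to the 64-bit constant 1 << 60.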
1459       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1460           .addUse(AArch64::FP)
1461           .addImm(0x1100)
1462           .setMIFlag(MachineInstr::FrameSetup);
1463       break;
1464 
1465     case SwiftAsyncFramePointerMode::Never:
1466       break;
1467     }
1468   }
1469 
1470   // All calls are tail calls in GHC calling conv, and functions have no
1471   // prologue/epilogue.
1472   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1473     return;
1474 
1475   // Set tagged base pointer to the requested stack slot.
1476   // Ideally it should match SP value after prologue.
1477   Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1478   if (TBPI)
1479     AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1480   else
1481     AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1482 
1483   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1484 
1485   // getStackSize() includes all the locals in its size calculation. We don't
1486   // include these locals when computing the stack size of a funclet, as they
1487   // are allocated in the parent's stack frame and accessed via the frame
1488   // pointer from the funclet.  We only save the callee saved registers in the
1489   // funclet, which are really the callee saved registers of the parent
1490   // function, including the funclet.
1491   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1492                                : MFI.getStackSize();
1493   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1494     assert(!HasFP && "unexpected function without stack frame but with FP");
1495     assert(!SVEStackSize &&
1496            "unexpected function without stack frame but with SVE objects");
1497     // All of the stack allocation is for locals.
1498     AFI->setLocalStackSize(NumBytes);
1499     if (!NumBytes)
1500       return;
1501     // REDZONE: If the stack size is less than 128 bytes, we don't need
1502     // to actually allocate.
1503     if (canUseRedZone(MF)) {
1504       AFI->setHasRedZone(true);
1505       ++NumRedZoneFunctions;
1506     } else {
1507       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1508                       StackOffset::getFixed(-NumBytes), TII,
1509                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1510       if (EmitCFI) {
1511         // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1512         MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1513         // Encode the stack size of the leaf function.
1514         unsigned CFIIndex = MF.addFrameInst(
1515             MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1516         BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1517             .addCFIIndex(CFIIndex)
1518             .setMIFlags(MachineInstr::FrameSetup);
1519       }
1520     }
1521 
1522     if (NeedsWinCFI) {
1523       HasWinCFI = true;
1524       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1525           .setMIFlag(MachineInstr::FrameSetup);
1526     }
1527 
1528     return;
1529   }
1530 
1531   bool IsWin64 =
1532       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1533   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1534 
1535   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1536   // All of the remaining stack allocations are for locals.
1537   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1538   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1539   bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1540   if (CombineSPBump) {
1541     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1542     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1543                     StackOffset::getFixed(-NumBytes), TII,
1544                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
1545                     EmitCFI);
1546     NumBytes = 0;
1547   } else if (HomPrologEpilog) {
1548     // Stack has been already adjusted.
1549     NumBytes -= PrologueSaveSize;
1550   } else if (PrologueSaveSize != 0) {
1551     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1552         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
1553         EmitCFI);
1554     NumBytes -= PrologueSaveSize;
1555   }
1556   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1557 
1558   // Move past the saves of the callee-saved registers, fixing up the offsets
1559   // and pre-inc if we decided to combine the callee-save and local stack
1560   // pointer bump above.
1561   MachineBasicBlock::iterator End = MBB.end();
1562   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1563          !IsSVECalleeSave(MBBI)) {
1564     if (CombineSPBump)
1565       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1566                                         NeedsWinCFI, &HasWinCFI);
1567     ++MBBI;
1568   }
1569 
1570   // For funclets the FP belongs to the containing function.
1571   if (!IsFunclet && HasFP) {
1572     // Only set up FP if we actually need to.
1573     int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1574 
1575     if (CombineSPBump)
1576       FPOffset += AFI->getLocalStackSize();
1577 
1578     if (AFI->hasSwiftAsyncContext()) {
1579       // Before we update the live FP we have to ensure there's a valid (or
1580       // null) asynchronous context in its slot just before FP in the frame
1581       // record, so store it now.
1582       const auto &Attrs = MF.getFunction().getAttributes();
1583       bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1584       if (HaveInitialContext)
1585         MBB.addLiveIn(AArch64::X22);
1586       BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1587           .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
1588           .addUse(AArch64::SP)
1589           .addImm(FPOffset - 8)
1590           .setMIFlags(MachineInstr::FrameSetup);
1591     }
1592 
1593     if (HomPrologEpilog) {
1594       auto Prolog = MBBI;
1595       --Prolog;
1596       assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1597       Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1598     } else {
1599       // Issue    sub fp, sp, FPOffset or
1600       //          mov fp,sp          when FPOffset is zero.
1601       // Note: All stores of callee-saved registers are marked as "FrameSetup".
1602       // This code marks the instruction(s) that set the FP also.
1603       emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1604                       StackOffset::getFixed(FPOffset), TII,
1605                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1606     }
1607     if (EmitCFI) {
1608       // Define the current CFA rule to use the provided FP.
1609       const int OffsetToFirstCalleeSaveFromFP =
1610           AFI->getCalleeSaveBaseToFrameRecordOffset() -
1611           AFI->getCalleeSavedStackSize();
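           // FixedObject - OffsetToFirstCalleeSaveFromFP is the distance from FP
           // back up to the stack pointer value at function entry, so the CFA
           // remains the incoming SP.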
1612       Register FramePtr = RegInfo->getFrameRegister(MF);
1613       unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1614       unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
1615           nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1616       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1617           .addCFIIndex(CFIIndex)
1618           .setMIFlags(MachineInstr::FrameSetup);
1619     }
1620   }
1621 
1622   // Now emit the moves for whatever callee saved regs we have (including FP,
1623   // LR if those are saved). Frame instructions for SVE registers are emitted
1624   // later, after the instructions which actually save the SVE regs.
1625   if (EmitCFI)
1626     emitCalleeSavedGPRLocations(MBB, MBBI);
1627 
1628   if (windowsRequiresStackProbe(MF, NumBytes)) {
1629     uint64_t NumWords = NumBytes >> 4;
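         // x15 carries the allocation size in 16-byte units; Windows __chkstk
         // probes the stack based on that count, and the SUB below scales it
         // back up by 16 when actually adjusting SP.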
1630     if (NeedsWinCFI) {
1631       HasWinCFI = true;
1632       // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1633       // exceed this amount.  We need to move at most 2^24 - 1 into x15.
1634       // This is at most two instructions, MOVZ followed by MOVK.
1635       // TODO: Fix to use multiple stack alloc unwind codes for stacks
1636       // exceeding 256MB in size.
1637       if (NumBytes >= (1 << 28))
1638         report_fatal_error("Stack size cannot exceed 256MB for stack "
1639                             "unwinding purposes");
1640 
1641       uint32_t LowNumWords = NumWords & 0xFFFF;
1642       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1643             .addImm(LowNumWords)
1644             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1645             .setMIFlag(MachineInstr::FrameSetup);
1646       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1647             .setMIFlag(MachineInstr::FrameSetup);
1648       if ((NumWords & 0xFFFF0000) != 0) {
1649           BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1650               .addReg(AArch64::X15)
1651               .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1652               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1653               .setMIFlag(MachineInstr::FrameSetup);
1654           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1655             .setMIFlag(MachineInstr::FrameSetup);
1656       }
1657     } else {
1658       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1659           .addImm(NumWords)
1660           .setMIFlags(MachineInstr::FrameSetup);
1661     }
1662 
1663     switch (MF.getTarget().getCodeModel()) {
1664     case CodeModel::Tiny:
1665     case CodeModel::Small:
1666     case CodeModel::Medium:
1667     case CodeModel::Kernel:
1668       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1669           .addExternalSymbol("__chkstk")
1670           .addReg(AArch64::X15, RegState::Implicit)
1671           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1672           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1673           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1674           .setMIFlags(MachineInstr::FrameSetup);
1675       if (NeedsWinCFI) {
1676         HasWinCFI = true;
1677         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1678             .setMIFlag(MachineInstr::FrameSetup);
1679       }
1680       break;
1681     case CodeModel::Large:
1682       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1683           .addReg(AArch64::X16, RegState::Define)
1684           .addExternalSymbol("__chkstk")
1685           .addExternalSymbol("__chkstk")
1686           .setMIFlags(MachineInstr::FrameSetup);
1687       if (NeedsWinCFI) {
1688         HasWinCFI = true;
1689         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1690             .setMIFlag(MachineInstr::FrameSetup);
1691       }
1692 
1693       BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1694           .addReg(AArch64::X16, RegState::Kill)
1695           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1696           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1697           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1698           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1699           .setMIFlags(MachineInstr::FrameSetup);
1700       if (NeedsWinCFI) {
1701         HasWinCFI = true;
1702         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1703             .setMIFlag(MachineInstr::FrameSetup);
1704       }
1705       break;
1706     }
1707 
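         // SUBXrx64 with a UXTX #4 extend computes SP - (x15 << 4), i.e. it
         // subtracts NumWords * 16 bytes now that the pages have been probed.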
1708     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1709         .addReg(AArch64::SP, RegState::Kill)
1710         .addReg(AArch64::X15, RegState::Kill)
1711         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1712         .setMIFlags(MachineInstr::FrameSetup);
1713     if (NeedsWinCFI) {
1714       HasWinCFI = true;
1715       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1716           .addImm(NumBytes)
1717           .setMIFlag(MachineInstr::FrameSetup);
1718     }
1719     NumBytes = 0;
1720   }
1721 
1722   StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1723   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1724 
1725   // Process the SVE callee-saves to determine what space needs to be
1726   // allocated.
1727   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1728     // Find callee save instructions in frame.
1729     CalleeSavesBegin = MBBI;
1730     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1731     while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1732       ++MBBI;
1733     CalleeSavesEnd = MBBI;
1734 
1735     AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1736     AllocateAfter = SVEStackSize - AllocateBefore;
1737   }
1738 
1739   // Allocate space for the callee saves (if any).
1740   emitFrameOffset(
1741       MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII,
1742       MachineInstr::FrameSetup, false, false, nullptr,
1743       EmitCFI && !HasFP && AllocateBefore,
1744       StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
1745 
1746   if (EmitCFI)
1747     emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
1748 
1749   // Finally allocate remaining SVE stack space.
1750   emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1751                   -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
1752                   nullptr, EmitCFI && !HasFP && AllocateAfter,
1753                   AllocateBefore + StackOffset::getFixed(
1754                                        (int64_t)MFI.getStackSize() - NumBytes));
1755 
1756   // Allocate space for the rest of the frame.
1757   if (NumBytes) {
1758     // Alignment is required for the parent frame, not the funclet
1759     const bool NeedsRealignment =
1760         !IsFunclet && RegInfo->hasStackRealignment(MF);
1761     unsigned scratchSPReg = AArch64::SP;
1762 
1763     if (NeedsRealignment) {
1764       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1765       assert(scratchSPReg != AArch64::NoRegister);
1766     }
1767 
1768     // If we're a leaf function, try using the red zone.
1769     if (!canUseRedZone(MF)) {
1770       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1771       // the correct value here, as NumBytes also includes padding bytes,
1772       // which shouldn't be counted here.
1773       emitFrameOffset(
1774           MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1775           StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup,
1776           false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
1777           SVEStackSize +
1778               StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
1779     }
1780     if (NeedsRealignment) {
1781       const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1782       assert(NrBitsToZero > 1);
1783       assert(scratchSPReg != AArch64::SP);
1784 
1785       // SUB X9, SP, NumBytes
1786       //   -- X9 is a temporary register, so it shouldn't contain any live data here,
1787       //   -- free to use. This is already produced by emitFrameOffset above.
1788       // AND SP, X9, 0b11111...0000
1789       // The logical immediates have a non-trivial encoding. The following
1790       // formula computes the encoded immediate with all ones but
1791       // NrBitsToZero zero bits as least significant bits.
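           // For example, with a 16-byte MaxAlign, NrBitsToZero is 4, giving
           // N=1, immr=60, imms=59, which decodes to the mask
           // 0xFFFFFFFFFFFFFFF0 (all ones with the low 4 bits cleared).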
1792       uint32_t andMaskEncoded = (1 << 12)                         // = N
1793                                 | ((64 - NrBitsToZero) << 6)      // immr
1794                                 | ((64 - NrBitsToZero - 1) << 0); // imms
1795 
1796       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1797           .addReg(scratchSPReg, RegState::Kill)
1798           .addImm(andMaskEncoded);
1799       AFI->setStackRealigned(true);
1800       if (NeedsWinCFI) {
1801         HasWinCFI = true;
1802         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1803             .addImm(NumBytes & andMaskEncoded)
1804             .setMIFlag(MachineInstr::FrameSetup);
1805       }
1806     }
1807   }
1808 
1809   // If we need a base pointer, set it up here. It's whatever the value of the
1810   // stack pointer is at this point. Any variable-sized objects will be allocated
1811   // after this, so we can still use the base pointer to reference locals.
1812   //
1813   // FIXME: Clarify FrameSetup flags here.
1814   // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1815   // needed.
1816   // For funclets the BP belongs to the containing function.
1817   if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1818     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1819                      false);
1820     if (NeedsWinCFI) {
1821       HasWinCFI = true;
1822       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1823           .setMIFlag(MachineInstr::FrameSetup);
1824     }
1825   }
1826 
1827   // The very last FrameSetup instruction indicates the end of prologue. Emit a
1828   // SEH opcode indicating the prologue end.
1829   if (NeedsWinCFI && HasWinCFI) {
1830     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1831         .setMIFlag(MachineInstr::FrameSetup);
1832   }
1833 
1834   // SEH funclets are passed the frame pointer in X1.  If the parent
1835   // function uses the base register, then the base register is used
1836   // directly, and is not retrieved from X1.
1837   if (IsFunclet && F.hasPersonalityFn()) {
1838     EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1839     if (isAsynchronousEHPersonality(Per)) {
1840       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1841           .addReg(AArch64::X1)
1842           .setMIFlag(MachineInstr::FrameSetup);
1843       MBB.addLiveIn(AArch64::X1);
1844     }
1845   }
1846 }
1847 
1848 static void InsertReturnAddressAuth(MachineFunction &MF,
1849                                     MachineBasicBlock &MBB) {
1850   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1851   if (!MFI.shouldSignReturnAddress())
1852     return;
1853   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1854   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1855 
1856   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1857   DebugLoc DL;
1858   if (MBBI != MBB.end())
1859     DL = MBBI->getDebugLoc();
1860 
1861   // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1862   // this instruction can be safely used for any v8a architecture.
1863   // From v8.3a onwards there are optimised authenticate LR and return
1864   // instructions, namely RETA{A,B}, that can be used instead. In this case the
1865   // DW_CFA_AARCH64_negate_ra_state can't be emitted.
1866   if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1867       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1868     BuildMI(MBB, MBBI, DL,
1869             TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1870         .copyImplicitOps(*MBBI);
1871     MBB.erase(MBBI);
1872   } else {
1873     BuildMI(
1874         MBB, MBBI, DL,
1875         TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1876         .setMIFlag(MachineInstr::FrameDestroy);
1877 
1878     unsigned CFIIndex =
1879         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1880     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1881         .addCFIIndex(CFIIndex)
1882         .setMIFlags(MachineInstr::FrameDestroy);
1883   }
1884 }
1885 
1886 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1887   switch (MI.getOpcode()) {
1888   default:
1889     return false;
1890   case AArch64::CATCHRET:
1891   case AArch64::CLEANUPRET:
1892     return true;
1893   }
1894 }
1895 
1896 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1897                                         MachineBasicBlock &MBB) const {
1898   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1899   MachineFrameInfo &MFI = MF.getFrameInfo();
1900   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1901   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1902   DebugLoc DL;
1903   bool NeedsWinCFI = needsWinCFI(MF);
1904   bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo();
1905   bool HasWinCFI = false;
1906   bool IsFunclet = false;
1907   auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1908 
1909   if (MBB.end() != MBBI) {
1910     DL = MBBI->getDebugLoc();
1911     IsFunclet = isFuncletReturnInstr(*MBBI);
1912   }
1913 
1914   auto FinishingTouches = make_scope_exit([&]() {
1915     InsertReturnAddressAuth(MF, MBB);
1916     if (needsShadowCallStackPrologueEpilogue(MF))
1917       emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
1918     if (EmitCFI)
1919       emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
1920   });
1921 
1922   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1923                                : MFI.getStackSize();
1924   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1925 
1926   // All calls are tail calls in GHC calling conv, and functions have no
1927   // prologue/epilogue.
1928   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1929     return;
1930 
1931   // How much of the stack used by incoming arguments this function is expected
1932   // to restore in this particular epilogue.
1933   int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1934   bool IsWin64 =
1935       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1936   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1937 
1938   int64_t AfterCSRPopSize = ArgumentStackToRestore;
1939   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1940   // We cannot rely on the local stack size set in emitPrologue if the function
1941   // has funclets, as funclets have different local stack size requirements, and
1942   // the current value set in emitPrologue may be that of the containing
1943   // function.
1944   if (MF.hasEHFunclets())
1945     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1946   if (homogeneousPrologEpilog(MF, &MBB)) {
1947     assert(!NeedsWinCFI);
1948     auto LastPopI = MBB.getFirstTerminator();
1949     if (LastPopI != MBB.begin()) {
1950       auto HomogeneousEpilog = std::prev(LastPopI);
1951       if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1952         LastPopI = HomogeneousEpilog;
1953     }
1954 
1955     // Adjust local stack
1956     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1957                     StackOffset::getFixed(AFI->getLocalStackSize()), TII,
1958                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1959 
1960     // SP has been already adjusted while restoring callee save regs.
1961     // We've bailed-out the case with adjusting SP for arguments.
1962     assert(AfterCSRPopSize == 0);
1963     return;
1964   }
1965   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1966   // Assume we can't combine the last pop with the sp restore.
1967 
1968   bool CombineAfterCSRBump = false;
1969   if (!CombineSPBump && PrologueSaveSize != 0) {
1970     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1971     while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
1972            AArch64InstrInfo::isSEHInstruction(*Pop))
1973       Pop = std::prev(Pop);
1974     // Converting the last ldp to a post-index ldp is valid only if the last
1975     // ldp's offset is 0.
1976     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1977     // If the offset is 0 and the AfterCSR pop is not actually trying to
1978     // allocate more stack for arguments (in space that an untimely interrupt
1979     // may clobber), convert it to a post-index ldp.
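         // E.g. "ldp x29, x30, [sp]" plus a separate SP increment becomes
         // "ldp x29, x30, [sp], #PrologueSaveSize" (illustrative registers).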
1980     if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
1981       convertCalleeSaveRestoreToSPPrePostIncDec(
1982           MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
1983           MachineInstr::FrameDestroy, PrologueSaveSize);
1984     } else {
1985       // If not, make sure to emit an add after the last ldp.
1986       // We're doing this by transferring the size to be restored from the
1987       // adjustment *before* the CSR pops to the adjustment *after* the CSR
1988       // pops.
1989       AfterCSRPopSize += PrologueSaveSize;
1990       CombineAfterCSRBump = true;
1991     }
1992   }
1993 
1994   // Move past the restores of the callee-saved registers.
1995   // If we plan on combining the sp bump of the local stack size and the callee
1996   // save stack size, we might need to adjust the CSR save and restore offsets.
1997   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1998   MachineBasicBlock::iterator Begin = MBB.begin();
1999   while (LastPopI != Begin) {
2000     --LastPopI;
2001     if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
2002         IsSVECalleeSave(LastPopI)) {
2003       ++LastPopI;
2004       break;
2005     } else if (CombineSPBump)
2006       fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
2007                                         NeedsWinCFI, &HasWinCFI);
2008   }
2009 
2010   if (MF.hasWinCFI()) {
2011     // If the prologue didn't contain any SEH opcodes and didn't set the
2012     // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
2013     // EpilogStart - to avoid generating CFI for functions that don't need it.
2014     // (And as we didn't generate any prologue at all, it would be asymmetrical
2015     // to the epilogue.) By the end of the function, we assert that
2016     // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
2017     HasWinCFI = true;
2018     BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
2019         .setMIFlag(MachineInstr::FrameDestroy);
2020   }
2021 
2022   if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2023     switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
2024     case SwiftAsyncFramePointerMode::DeploymentBased:
2025       // Avoid the reload as it is GOT relative, and instead fall back to the
2026       // hardcoded value below.  This allows a mismatch between the OS and
2027       // application without immediately terminating on the difference.
2028       LLVM_FALLTHROUGH;
2029     case SwiftAsyncFramePointerMode::Always:
2030       // We need to reset FP to its untagged state on return. Bit 60 is
2031       // currently used to show the presence of an extended frame.
2032 
2033       // BIC x29, x29, #0x1000_0000_0000_0000
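           // 0x10fe is the encoded logical immediate (N=1, immr=3, imms=62), which
           // decodes to ~(1 << 60), i.e. an AND mask that clears only bit 60.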
2034       BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
2035               AArch64::FP)
2036           .addUse(AArch64::FP)
2037           .addImm(0x10fe)
2038           .setMIFlag(MachineInstr::FrameDestroy);
2039       break;
2040 
2041     case SwiftAsyncFramePointerMode::Never:
2042       break;
2043     }
2044   }
2045 
2046   const StackOffset &SVEStackSize = getSVEStackSize(MF);
2047 
2048   // If there is a single SP update, insert it before the ret and we're done.
2049   if (CombineSPBump) {
2050     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
2051 
2052     // When we are about to restore the CSRs, the CFA register is SP again.
2053     if (EmitCFI && hasFP(MF)) {
2054       const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2055       unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2056       unsigned CFIIndex =
2057           MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
2058       BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2059           .addCFIIndex(CFIIndex)
2060           .setMIFlags(MachineInstr::FrameDestroy);
2061     }
2062 
2063     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2064                     StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
2065                     TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
2066                     &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
2067     if (HasWinCFI)
2068       BuildMI(MBB, MBB.getFirstTerminator(), DL,
2069               TII->get(AArch64::SEH_EpilogEnd))
2070           .setMIFlag(MachineInstr::FrameDestroy);
2071     return;
2072   }
2073 
2074   NumBytes -= PrologueSaveSize;
2075   assert(NumBytes >= 0 && "Negative stack allocation size!?");
2076 
2077   // Process the SVE callee-saves to determine what space needs to be
2078   // deallocated.
2079   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
2080   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
2081   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2082     RestoreBegin = std::prev(RestoreEnd);
2083     while (RestoreBegin != MBB.begin() &&
2084            IsSVECalleeSave(std::prev(RestoreBegin)))
2085       --RestoreBegin;
2086 
2087     assert(IsSVECalleeSave(RestoreBegin) &&
2088            IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
2089 
2090     StackOffset CalleeSavedSizeAsOffset =
2091         StackOffset::getScalable(CalleeSavedSize);
2092     DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
2093     DeallocateAfter = CalleeSavedSizeAsOffset;
2094   }
2095 
2096   // Deallocate the SVE area.
2097   if (SVEStackSize) {
2098     // If we have stack realignment or variable sized objects on the stack,
2099     // restore the stack pointer from the frame pointer prior to SVE CSR
2100     // restoration.
2101     if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
2102       if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2103         // Set SP to start of SVE callee-save area from which they can
2104         // be reloaded. The code below will deallocate the stack space
2105         // by moving FP -> SP.
2106         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
2107                         StackOffset::getScalable(-CalleeSavedSize), TII,
2108                         MachineInstr::FrameDestroy);
2109       }
2110     } else {
2111       if (AFI->getSVECalleeSavedStackSize()) {
2112         // Deallocate the non-SVE locals first before we can deallocate (and
2113         // restore callee saves) from the SVE area.
2114         emitFrameOffset(
2115             MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2116             StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
2117             false, false, nullptr, EmitCFI && !hasFP(MF),
2118             SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
2119         NumBytes = 0;
2120       }
2121 
2122       emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2123                       DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
2124                       false, nullptr, EmitCFI && !hasFP(MF),
2125                       SVEStackSize +
2126                           StackOffset::getFixed(NumBytes + PrologueSaveSize));
2127 
2128       emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
2129                       DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
2130                       false, nullptr, EmitCFI && !hasFP(MF),
2131                       DeallocateAfter +
2132                           StackOffset::getFixed(NumBytes + PrologueSaveSize));
2133     }
2134     if (EmitCFI)
2135       emitCalleeSavedSVERestores(MBB, RestoreEnd);
2136   }
2137 
2138   if (!hasFP(MF)) {
2139     bool RedZone = canUseRedZone(MF);
2140     // If this was a redzone leaf function, we don't need to restore the
2141     // stack pointer (but we may need to pop stack args for fastcc).
2142     if (RedZone && AfterCSRPopSize == 0)
2143       return;
2144 
2145     // Pop the local variables off the stack. If there are no callee-saved
2146     // registers, it means we are actually positioned at the terminator and can
2147     // combine stack increment for the locals and the stack increment for
2148     // callee-popped arguments into (possibly) a single instruction and be done.
2149     bool NoCalleeSaveRestore = PrologueSaveSize == 0;
2150     int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
2151     if (NoCalleeSaveRestore)
2152       StackRestoreBytes += AfterCSRPopSize;
2153 
2154     emitFrameOffset(
2155         MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2156         StackOffset::getFixed(StackRestoreBytes), TII,
2157         MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2158         StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
2159 
2160     // If we were able to combine the local stack pop with the argument pop,
2161     // then we're done.
2162     if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
2163       if (HasWinCFI) {
2164         BuildMI(MBB, MBB.getFirstTerminator(), DL,
2165                 TII->get(AArch64::SEH_EpilogEnd))
2166             .setMIFlag(MachineInstr::FrameDestroy);
2167       }
2168       return;
2169     }
2170 
2171     NumBytes = 0;
2172   }
2173 
2174   // Restore the original stack pointer.
2175   // FIXME: Rather than doing the math here, we should instead just use
2176   // non-post-indexed loads for the restores if we aren't actually going to
2177   // be able to save any instructions.
2178   if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
2179     emitFrameOffset(
2180         MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
2181         StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
2182         TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
2183   } else if (NumBytes)
2184     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2185                     StackOffset::getFixed(NumBytes), TII,
2186                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
2187 
2188   // When we are about to restore the CSRs, the CFA register is SP again.
2189   if (EmitCFI && hasFP(MF)) {
2190     const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2191     unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2192     unsigned CFIIndex = MF.addFrameInst(
2193         MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize));
2194     BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2195         .addCFIIndex(CFIIndex)
2196         .setMIFlags(MachineInstr::FrameDestroy);
2197   }
2198 
2199   // This must be placed after the callee-save restore code because that code
2200   // assumes the SP is at the same location as it was after the callee-save spill
2201   // code in the prologue.
2202   if (AfterCSRPopSize) {
2203     assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
2204                                   "interrupt may have clobbered");
2205 
2206     emitFrameOffset(
2207         MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2208         StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
2209         false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2210         StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
2211   }
2212   if (HasWinCFI)
2213     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
2214         .setMIFlag(MachineInstr::FrameDestroy);
2215 }
2216 
2217 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2218 /// debug info.  It's the same as what we use for resolving the code-gen
2219 /// references for now.  FIXME: This can go wrong when references are
2220 /// SP-relative and simple call frames aren't used.
2221 StackOffset
2222 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
2223                                              Register &FrameReg) const {
2224   return resolveFrameIndexReference(
2225       MF, FI, FrameReg,
2226       /*PreferFP=*/
2227       MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
2228       /*ForSimm=*/false);
2229 }
2230 
2231 StackOffset
2232 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
2233                                                      int FI) const {
2234   return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
2235 }
2236 
2237 static StackOffset getFPOffset(const MachineFunction &MF,
2238                                int64_t ObjectOffset) {
2239   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2240   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2241   bool IsWin64 =
2242       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
2243   unsigned FixedObject =
2244       getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2245   int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
2246   int64_t FPAdjust =
2247       CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
2248   return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
2249 }
2250 
2251 static StackOffset getStackOffset(const MachineFunction &MF,
2252                                   int64_t ObjectOffset) {
2253   const auto &MFI = MF.getFrameInfo();
2254   return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
2255 }
2256 
2257 // TODO: This function currently does not work for scalable vectors.
2258 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2259                                                  int FI) const {
2260   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2261       MF.getSubtarget().getRegisterInfo());
2262   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2263   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2264              ? getFPOffset(MF, ObjectOffset).getFixed()
2265              : getStackOffset(MF, ObjectOffset).getFixed();
2266 }
2267 
2268 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2269     const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2270     bool ForSimm) const {
2271   const auto &MFI = MF.getFrameInfo();
2272   int64_t ObjectOffset = MFI.getObjectOffset(FI);
2273   bool isFixed = MFI.isFixedObjectIndex(FI);
2274   bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2275   return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2276                                      PreferFP, ForSimm);
2277 }
2278 
2279 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2280     const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2281     Register &FrameReg, bool PreferFP, bool ForSimm) const {
2282   const auto &MFI = MF.getFrameInfo();
2283   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2284       MF.getSubtarget().getRegisterInfo());
2285   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2286   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2287 
2288   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2289   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2290   bool isCSR =
2291       !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2292 
2293   const StackOffset &SVEStackSize = getSVEStackSize(MF);
2294 
2295   // Use frame pointer to reference fixed objects. Use it for locals if
2296   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2297   // reliable as a base). Make sure useFPForScavengingIndex() does the
2298   // right thing for the emergency spill slot.
2299   bool UseFP = false;
2300   if (AFI->hasStackFrame() && !isSVE) {
2301     // We shouldn't prefer using the FP to access fixed-sized stack objects when
2302     // there are scalable (SVE) objects in between the FP and the fixed-sized
2303     // objects.
2304     PreferFP &= !SVEStackSize;
2305 
2306     // Note: Keeping the following as multiple 'if' statements rather than
2307     // merging to a single expression for readability.
2308     //
2309     // Argument access should always use the FP.
2310     if (isFixed) {
2311       UseFP = hasFP(MF);
2312     } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2313       // References to the CSR area must use FP if we're re-aligning the stack
2314       // since the dynamically-sized alignment padding is between the SP/BP and
2315       // the CSR area.
2316       assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2317       UseFP = true;
2318     } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2319       // If the FPOffset is negative and we're producing a signed immediate, we
2320       // have to keep in mind that the available offset range for negative
2321       // offsets is smaller than for positive ones. If an offset is available
2322       // via the FP and the SP, use whichever is closest.
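           // The unscaled ldur/stur forms take a signed 9-bit immediate covering
           // [-256, 255], hence the -256 lower bound checked here.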
2323       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2324       PreferFP |= Offset > -FPOffset && !SVEStackSize;
2325 
2326       if (MFI.hasVarSizedObjects()) {
2327         // If we have variable sized objects, we can use either FP or BP, as the
2328         // SP offset is unknown. We can use the base pointer if we have one and
2329         // FP is not preferred. If not, we're stuck with using FP.
2330         bool CanUseBP = RegInfo->hasBasePointer(MF);
2331         if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2332           UseFP = PreferFP;
2333         else if (!CanUseBP) // Can't use BP. Forced to use FP.
2334           UseFP = true;
2335         // else we can use BP and FP, but the offset from FP won't fit.
2336         // That will make us scavenge registers which we can probably avoid by
2337         // using BP. If it won't fit for BP either, we'll scavenge anyway.
2338       } else if (FPOffset >= 0) {
2339         // Use SP or FP, whichever gives us the best chance of the offset
2340         // being in range for direct access. If the FPOffset is positive,
2341         // that'll always be best, as the SP will be even further away.
2342         UseFP = true;
2343       } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2344         // Funclets access the locals contained in the parent's stack frame
2345         // via the frame pointer, so we have to use the FP in the parent
2346         // function.
2347         (void) Subtarget;
2348         assert(
2349             Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2350             "Funclets should only be present on Win64");
2351         UseFP = true;
2352       } else {
2353         // We have the choice between FP and (SP or BP).
2354         if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2355           UseFP = true;
2356       }
2357     }
2358   }
2359 
2360   assert(
2361       ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2362       "In the presence of dynamic stack pointer realignment, "
2363       "non-argument/CSR objects cannot be accessed through the frame pointer");
2364 
2365   if (isSVE) {
2366     StackOffset FPOffset =
2367         StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2368     StackOffset SPOffset =
2369         SVEStackSize +
2370         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2371                          ObjectOffset);
2372     // Always use the FP for SVE spills if available and beneficial.
2373     if (hasFP(MF) && (SPOffset.getFixed() ||
2374                       FPOffset.getScalable() < SPOffset.getScalable() ||
2375                       RegInfo->hasStackRealignment(MF))) {
2376       FrameReg = RegInfo->getFrameRegister(MF);
2377       return FPOffset;
2378     }
2379 
2380     FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2381                                            : (unsigned)AArch64::SP;
2382     return SPOffset;
2383   }
2384 
2385   StackOffset ScalableOffset = {};
2386   if (UseFP && !(isFixed || isCSR))
2387     ScalableOffset = -SVEStackSize;
2388   if (!UseFP && (isFixed || isCSR))
2389     ScalableOffset = SVEStackSize;
2390 
2391   if (UseFP) {
2392     FrameReg = RegInfo->getFrameRegister(MF);
2393     return StackOffset::getFixed(FPOffset) + ScalableOffset;
2394   }
2395 
2396   // Use the base pointer if we have one.
2397   if (RegInfo->hasBasePointer(MF))
2398     FrameReg = RegInfo->getBaseRegister();
2399   else {
2400     assert(!MFI.hasVarSizedObjects() &&
2401            "Can't use SP when we have var sized objects.");
2402     FrameReg = AArch64::SP;
2403     // If we're using the red zone for this function, the SP won't actually
2404     // be adjusted, so the offsets will be negative. They're also all
2405     // within range of the signed 9-bit immediate instructions.
2406     if (canUseRedZone(MF))
2407       Offset -= AFI->getLocalStackSize();
2408   }
2409 
2410   return StackOffset::getFixed(Offset) + ScalableOffset;
2411 }
2412 
2413 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2414   // Do not set a kill flag on values that are also marked as live-in. This
2415   // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2416   // callee saved registers.
2417   // Omitting the kill flags is conservatively correct even if the live-in
2418   // is not used after all.
2419   bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2420   return getKillRegState(!IsLiveIn);
2421 }
2422 
2423 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2424   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2425   AttributeList Attrs = MF.getFunction().getAttributes();
2426   return Subtarget.isTargetMachO() &&
2427          !(Subtarget.getTargetLowering()->supportSwiftError() &&
2428            Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2429          MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2430 }
2431 
2432 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2433                                              bool NeedsWinCFI, bool IsFirst) {
2434   // If we are generating register pairs for a Windows function that requires
2435   // EH support, then pair consecutive registers only.  There are no unwind
2436   // opcodes for saves/restores of non-consecutive register pairs.
2437   // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2438   // save_lrpair.
2439   // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2440 
2441   if (Reg2 == AArch64::FP)
2442     return true;
2443   if (!NeedsWinCFI)
2444     return false;
2445   if (Reg2 == Reg1 + 1)
2446     return false;
2447   // If pairing a GPR with LR, the pair can be described by the save_lrpair
2448   // opcode. If this is the first register pair, it would end up with a
2449   // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2450   // if LR is paired with something other than the first register.
2451   // The save_lrpair opcode requires the first register to be an odd one.
2452   if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2453       (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2454     return false;
2455   return true;
2456 }
2457 
2458 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2459 /// WindowsCFI requires that only consecutive registers can be paired.
2460 /// LR and FP need to be allocated together when the frame needs to save
2461 /// the frame-record. This means any other register pairing with LR is invalid.
2462 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2463                                       bool UsesWinAAPCS, bool NeedsWinCFI,
2464                                       bool NeedsFrameRecord, bool IsFirst) {
2465   if (UsesWinAAPCS)
2466     return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2467 
2468   // If we need to store the frame record, don't pair any register
2469   // with LR other than FP.
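  // For example, a candidate pair whose second register is LR, such as
  // (x28, lr), is rejected here so that LR is left to form the (lr, fp)
  // frame-record pair.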
2470   if (NeedsFrameRecord)
2471     return Reg2 == AArch64::LR;
2472 
2473   return false;
2474 }
2475 
2476 namespace {
2477 
2478 struct RegPairInfo {
2479   unsigned Reg1 = AArch64::NoRegister;
2480   unsigned Reg2 = AArch64::NoRegister;
2481   int FrameIdx;
2482   int Offset;
2483   enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2484 
2485   RegPairInfo() = default;
2486 
2487   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2488 
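  // The scale below is the byte size of one saved register slot; the
  // STP/LDP-style immediates computed later are expressed in multiples of it
  // (e.g. byte offset 16 with a scale of 8 becomes an immediate of #2).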
2489   unsigned getScale() const {
2490     switch (Type) {
2491     case PPR:
2492       return 2;
2493     case GPR:
2494     case FPR64:
2495       return 8;
2496     case ZPR:
2497     case FPR128:
2498       return 16;
2499     }
2500     llvm_unreachable("Unsupported type");
2501   }
2502 
2503   bool isScalable() const { return Type == PPR || Type == ZPR; }
2504 };
2505 
2506 } // end anonymous namespace
2507 
2508 static void computeCalleeSaveRegisterPairs(
2509     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2510     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2511     bool NeedsFrameRecord) {
2512 
2513   if (CSI.empty())
2514     return;
2515 
2516   bool IsWindows = isTargetWindows(MF);
2517   bool NeedsWinCFI = needsWinCFI(MF);
2518   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2519   MachineFrameInfo &MFI = MF.getFrameInfo();
2520   CallingConv::ID CC = MF.getFunction().getCallingConv();
2521   unsigned Count = CSI.size();
2522   (void)CC;
2523   // MachO's compact unwind format relies on all registers being stored in
2524   // pairs.
2525   assert((!produceCompactUnwindFrame(MF) ||
2526           CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
2527           (Count & 1) == 0) &&
2528          "Odd number of callee-saved regs to spill!");
2529   int ByteOffset = AFI->getCalleeSavedStackSize();
2530   int StackFillDir = -1;
2531   int RegInc = 1;
2532   unsigned FirstReg = 0;
2533   if (NeedsWinCFI) {
2534     // For WinCFI, fill the stack from the bottom up.
2535     ByteOffset = 0;
2536     StackFillDir = 1;
2537     // As the CSI array is reversed to match PrologEpilogInserter, iterate
2538     // backwards, to pair up registers starting from lower numbered registers.
2539     RegInc = -1;
2540     FirstReg = Count - 1;
2541   }
2542   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2543   bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2544 
2545   // When iterating backwards, the loop condition relies on unsigned wraparound.
2546   for (unsigned i = FirstReg; i < Count; i += RegInc) {
2547     RegPairInfo RPI;
2548     RPI.Reg1 = CSI[i].getReg();
2549 
2550     if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2551       RPI.Type = RegPairInfo::GPR;
2552     else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2553       RPI.Type = RegPairInfo::FPR64;
2554     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2555       RPI.Type = RegPairInfo::FPR128;
2556     else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2557       RPI.Type = RegPairInfo::ZPR;
2558     else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2559       RPI.Type = RegPairInfo::PPR;
2560     else
2561       llvm_unreachable("Unsupported register class.");
2562 
2563     // Add the next reg to the pair if it is in the same register class.
2564     if (unsigned(i + RegInc) < Count) {
2565       Register NextReg = CSI[i + RegInc].getReg();
2566       bool IsFirst = i == FirstReg;
2567       switch (RPI.Type) {
2568       case RegPairInfo::GPR:
2569         if (AArch64::GPR64RegClass.contains(NextReg) &&
2570             !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2571                                        NeedsWinCFI, NeedsFrameRecord, IsFirst))
2572           RPI.Reg2 = NextReg;
2573         break;
2574       case RegPairInfo::FPR64:
2575         if (AArch64::FPR64RegClass.contains(NextReg) &&
2576             !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2577                                               IsFirst))
2578           RPI.Reg2 = NextReg;
2579         break;
2580       case RegPairInfo::FPR128:
2581         if (AArch64::FPR128RegClass.contains(NextReg))
2582           RPI.Reg2 = NextReg;
2583         break;
2584       case RegPairInfo::PPR:
2585       case RegPairInfo::ZPR:
2586         break;
2587       }
2588     }
2589 
2590     // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2591     // list to come in sorted by frame index so that we can issue the store
2592     // pair instructions directly. Assert if we see anything otherwise.
2593     //
2594     // The order of the registers in the list is controlled by
2595     // getCalleeSavedRegs(), so they will always be in-order, as well.
2596     assert((!RPI.isPaired() ||
2597             (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2598            "Out of order callee saved regs!");
2599 
2600     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2601             RPI.Reg1 == AArch64::LR) &&
2602            "FrameRecord must be allocated together with LR");
2603 
2604     // Windows AAPCS has FP and LR reversed.
2605     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2606             RPI.Reg2 == AArch64::LR) &&
2607            "FrameRecord must be allocated together with LR");
2608 
2609     // MachO's compact unwind format relies on all registers being stored in
2610     // adjacent register pairs.
2611     assert((!produceCompactUnwindFrame(MF) ||
2612             CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
2613             (RPI.isPaired() &&
2614              ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2615               RPI.Reg1 + 1 == RPI.Reg2))) &&
2616            "Callee-save registers not saved as adjacent register pair!");
2617 
2618     RPI.FrameIdx = CSI[i].getFrameIdx();
2619     if (NeedsWinCFI &&
2620         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2621       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2622 
2623     int Scale = RPI.getScale();
2624 
2625     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2626     assert(OffsetPre % Scale == 0);
2627 
2628     if (RPI.isScalable())
2629       ScalableByteOffset += StackFillDir * Scale;
2630     else
2631       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2632 
2633     // Swift's async context is directly before FP, so allocate an extra
2634     // 8 bytes for it.
2635     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2636         RPI.Reg2 == AArch64::FP)
2637       ByteOffset += StackFillDir * 8;
2638 
2639     assert(!(RPI.isScalable() && RPI.isPaired()) &&
2640            "Paired spill/fill instructions don't exist for SVE vectors");
2641 
2642     // Round up size of non-pair to pair size if we need to pad the
2643     // callee-save area to ensure 16-byte alignment.
2644     if (NeedGapToAlignStack && !NeedsWinCFI &&
2645         !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2646         !RPI.isPaired() && ByteOffset % 16 != 0) {
2647       ByteOffset += 8 * StackFillDir;
2648       assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2649       // A stack frame with a gap looks like this, bottom up:
2650       // d9, d8. x21, gap, x20, x19.
2651       // Set extra alignment on the x21 object to create the gap above it.
2652       MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2653       NeedGapToAlignStack = false;
2654     }
2655 
2656     int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2657     assert(OffsetPost % Scale == 0);
2658     // If filling top down (default), we want the offset after incrementing it.
2659     // If filling bottom up (WinCFI) we need the original offset.
2660     int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2661 
2662     // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2663     // Swift context can directly precede FP.
2664     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2665         RPI.Reg2 == AArch64::FP)
2666       Offset += 8;
2667     RPI.Offset = Offset / Scale;
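    // For example, a GPR pair placed 16 bytes above SP has Scale == 8 and
    // RPI.Offset == 2, which later materializes as "stp xN, xM, [sp, #16]".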
2668 
2669     assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2670             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2671            "Offset out of bounds for LDP/STP immediate");
2672 
2673     // Save the offset to frame record so that the FP register can point to the
2674     // innermost frame record (spilled FP and LR registers).
2675     if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2676                               RPI.Reg2 == AArch64::FP) ||
2677                              (IsWindows && RPI.Reg1 == AArch64::FP &&
2678                               RPI.Reg2 == AArch64::LR)))
2679       AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2680 
2681     RegPairs.push_back(RPI);
2682     if (RPI.isPaired())
2683       i += RegInc;
2684   }
2685   if (NeedsWinCFI) {
2686     // If we need an alignment gap in the stack, align the topmost stack
2687     // object. A stack frame with a gap looks like this, bottom up:
2688     // x19, d8. d9, gap.
2689     // Set extra alignment on the topmost stack object (the first element in
2690     // CSI, which goes top down), to create the gap above it.
2691     if (AFI->hasCalleeSaveStackFreeSpace())
2692       MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2693     // We iterated bottom up over the registers; flip RegPairs back to top
2694     // down order.
2695     std::reverse(RegPairs.begin(), RegPairs.end());
2696   }
2697 }
2698 
2699 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2700     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2701     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2702   MachineFunction &MF = *MBB.getParent();
2703   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2704   bool NeedsWinCFI = needsWinCFI(MF);
2705   DebugLoc DL;
2706   SmallVector<RegPairInfo, 8> RegPairs;
2707 
2708   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2709 
2710   const MachineRegisterInfo &MRI = MF.getRegInfo();
2711   if (homogeneousPrologEpilog(MF)) {
2712     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2713                    .setMIFlag(MachineInstr::FrameSetup);
2714 
2715     for (auto &RPI : RegPairs) {
2716       MIB.addReg(RPI.Reg1);
2717       MIB.addReg(RPI.Reg2);
2718 
2719       // Update register live in.
2720       if (!MRI.isReserved(RPI.Reg1))
2721         MBB.addLiveIn(RPI.Reg1);
2722       if (!MRI.isReserved(RPI.Reg2))
2723         MBB.addLiveIn(RPI.Reg2);
2724     }
2725     return true;
2726   }
2727   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
2728     unsigned Reg1 = RPI.Reg1;
2729     unsigned Reg2 = RPI.Reg2;
2730     unsigned StrOpc;
2731 
2732     // Issue sequence of spills for cs regs.  The first spill may be converted
2733     // to a pre-decrement store later by emitPrologue if the callee-save stack
2734     // area allocation can't be combined with the local stack area allocation.
2735     // For example:
2736     //    stp     x22, x21, [sp, #0]     // addImm(+0)
2737     //    stp     x20, x19, [sp, #16]    // addImm(+2)
2738     //    stp     fp, lr, [sp, #32]      // addImm(+4)
2739     // Rationale: This sequence saves uop updates compared to a sequence of
2740     // pre-increment spills like stp xi,xj,[sp,#-16]!
2741     // Note: Similar rationale and sequence for restores in epilog.
2742     unsigned Size;
2743     Align Alignment;
2744     switch (RPI.Type) {
2745     case RegPairInfo::GPR:
2746        StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2747        Size = 8;
2748        Alignment = Align(8);
2749        break;
2750     case RegPairInfo::FPR64:
2751        StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2752        Size = 8;
2753        Alignment = Align(8);
2754        break;
2755     case RegPairInfo::FPR128:
2756        StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2757        Size = 16;
2758        Alignment = Align(16);
2759        break;
2760     case RegPairInfo::ZPR:
2761        StrOpc = AArch64::STR_ZXI;
2762        Size = 16;
2763        Alignment = Align(16);
2764        break;
2765     case RegPairInfo::PPR:
2766        StrOpc = AArch64::STR_PXI;
2767        Size = 2;
2768        Alignment = Align(2);
2769        break;
2770     }
2771     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2772                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2773                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2774                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2775                dbgs() << ")\n");
2776 
2777     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2778            "Windows unwinding requires a consecutive (FP,LR) pair");
2779     // Windows unwind codes require consecutive registers if registers are
2780     // paired.  Make the switch here, so that the code below will save (x,x+1)
2781     // and not (x+1,x).
2782     unsigned FrameIdxReg1 = RPI.FrameIdx;
2783     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2784     if (NeedsWinCFI && RPI.isPaired()) {
2785       std::swap(Reg1, Reg2);
2786       std::swap(FrameIdxReg1, FrameIdxReg2);
2787     }
2788     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2789     if (!MRI.isReserved(Reg1))
2790       MBB.addLiveIn(Reg1);
2791     if (RPI.isPaired()) {
2792       if (!MRI.isReserved(Reg2))
2793         MBB.addLiveIn(Reg2);
2794       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2795       MIB.addMemOperand(MF.getMachineMemOperand(
2796           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2797           MachineMemOperand::MOStore, Size, Alignment));
2798     }
2799     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2800         .addReg(AArch64::SP)
2801         .addImm(RPI.Offset) // [sp, #offset*scale],
2802                             // where factor*scale is implicit
2803         .setMIFlag(MachineInstr::FrameSetup);
2804     MIB.addMemOperand(MF.getMachineMemOperand(
2805         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2806         MachineMemOperand::MOStore, Size, Alignment));
2807     if (NeedsWinCFI)
2808       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2809 
2810     // Update the StackIDs of the SVE stack slots.
2811     MachineFrameInfo &MFI = MF.getFrameInfo();
2812     if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2813       MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2814 
2815   }
2816   return true;
2817 }
2818 
2819 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2820     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2821     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2822   MachineFunction &MF = *MBB.getParent();
2823   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2824   DebugLoc DL;
2825   SmallVector<RegPairInfo, 8> RegPairs;
2826   bool NeedsWinCFI = needsWinCFI(MF);
2827 
2828   if (MBBI != MBB.end())
2829     DL = MBBI->getDebugLoc();
2830 
2831   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2832 
2833   auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
2834     unsigned Reg1 = RPI.Reg1;
2835     unsigned Reg2 = RPI.Reg2;
2836 
2837     // Issue sequence of restores for cs regs. The last restore may be converted
2838     // to a post-increment load later by emitEpilogue if the callee-save stack
2839     // area allocation can't be combined with the local stack area allocation.
2840     // For example:
2841     //    ldp     fp, lr, [sp, #32]       // addImm(+4)
2842     //    ldp     x20, x19, [sp, #16]     // addImm(+2)
2843     //    ldp     x22, x21, [sp, #0]      // addImm(+0)
2844     // Note: see comment in spillCalleeSavedRegisters()
2845     unsigned LdrOpc;
2846     unsigned Size;
2847     Align Alignment;
2848     switch (RPI.Type) {
2849     case RegPairInfo::GPR:
2850        LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2851        Size = 8;
2852        Alignment = Align(8);
2853        break;
2854     case RegPairInfo::FPR64:
2855        LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2856        Size = 8;
2857        Alignment = Align(8);
2858        break;
2859     case RegPairInfo::FPR128:
2860        LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2861        Size = 16;
2862        Alignment = Align(16);
2863        break;
2864     case RegPairInfo::ZPR:
2865        LdrOpc = AArch64::LDR_ZXI;
2866        Size = 16;
2867        Alignment = Align(16);
2868        break;
2869     case RegPairInfo::PPR:
2870        LdrOpc = AArch64::LDR_PXI;
2871        Size = 2;
2872        Alignment = Align(2);
2873        break;
2874     }
2875     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2876                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2877                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2878                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2879                dbgs() << ")\n");
2880 
2881     // Windows unwind codes require consecutive registers if registers are
2882     // paired.  Make the switch here, so that the code below will restore (x,x+1)
2883     // and not (x+1,x).
2884     unsigned FrameIdxReg1 = RPI.FrameIdx;
2885     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2886     if (NeedsWinCFI && RPI.isPaired()) {
2887       std::swap(Reg1, Reg2);
2888       std::swap(FrameIdxReg1, FrameIdxReg2);
2889     }
2890     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
2891     if (RPI.isPaired()) {
2892       MIB.addReg(Reg2, getDefRegState(true));
2893       MIB.addMemOperand(MF.getMachineMemOperand(
2894           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2895           MachineMemOperand::MOLoad, Size, Alignment));
2896     }
2897     MIB.addReg(Reg1, getDefRegState(true))
2898         .addReg(AArch64::SP)
2899         .addImm(RPI.Offset) // [sp, #offset*scale]
2900                             // where factor*scale is implicit
2901         .setMIFlag(MachineInstr::FrameDestroy);
2902     MIB.addMemOperand(MF.getMachineMemOperand(
2903         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2904         MachineMemOperand::MOLoad, Size, Alignment));
2905     if (NeedsWinCFI)
2906       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2907 
2908     return MIB->getIterator();
2909   };
2910 
2911   // SVE objects are always restored in reverse order.
2912   for (const RegPairInfo &RPI : reverse(RegPairs))
2913     if (RPI.isScalable())
2914       EmitMI(RPI);
2915 
2916   if (homogeneousPrologEpilog(MF, &MBB)) {
2917     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
2918                    .setMIFlag(MachineInstr::FrameDestroy);
2919     for (auto &RPI : RegPairs) {
2920       MIB.addReg(RPI.Reg1, RegState::Define);
2921       MIB.addReg(RPI.Reg2, RegState::Define);
2922     }
2923     return true;
2924   }
2925 
2926   if (ReverseCSRRestoreSeq) {
2927     MachineBasicBlock::iterator First = MBB.end();
2928     for (const RegPairInfo &RPI : reverse(RegPairs)) {
2929       if (RPI.isScalable())
2930         continue;
2931       MachineBasicBlock::iterator It = EmitMI(RPI);
2932       if (First == MBB.end())
2933         First = It;
2934     }
2935     if (First != MBB.end())
2936       MBB.splice(MBBI, &MBB, First);
2937   } else {
2938     for (const RegPairInfo &RPI : RegPairs) {
2939       if (RPI.isScalable())
2940         continue;
2941       (void)EmitMI(RPI);
2942     }
2943   }
2944 
2945   return true;
2946 }
2947 
2948 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2949                                                 BitVector &SavedRegs,
2950                                                 RegScavenger *RS) const {
2951   // All calls are tail calls in GHC calling conv, and functions have no
2952   // prologue/epilogue.
2953   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2954     return;
2955 
2956   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2957   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2958       MF.getSubtarget().getRegisterInfo());
2959   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2960   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2961   unsigned UnspilledCSGPR = AArch64::NoRegister;
2962   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2963 
2964   MachineFrameInfo &MFI = MF.getFrameInfo();
2965   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2966 
2967   unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2968                                 ? RegInfo->getBaseRegister()
2969                                 : (unsigned)AArch64::NoRegister;
2970 
2971   unsigned ExtraCSSpill = 0;
2972   // Figure out which callee-saved registers to save/restore.
2973   for (unsigned i = 0; CSRegs[i]; ++i) {
2974     const unsigned Reg = CSRegs[i];
2975 
2976     // Add the base pointer register to SavedRegs if it is callee-save.
2977     if (Reg == BasePointerReg)
2978       SavedRegs.set(Reg);
2979 
2980     bool RegUsed = SavedRegs.test(Reg);
2981     unsigned PairedReg = AArch64::NoRegister;
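    // The i ^ 1 lookup below assumes the callee-saved register list keeps
    // pairable registers adjacent (entries 0<->1, 2<->3, ...).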
2982     if (AArch64::GPR64RegClass.contains(Reg) ||
2983         AArch64::FPR64RegClass.contains(Reg) ||
2984         AArch64::FPR128RegClass.contains(Reg))
2985       PairedReg = CSRegs[i ^ 1];
2986 
2987     if (!RegUsed) {
2988       if (AArch64::GPR64RegClass.contains(Reg) &&
2989           !RegInfo->isReservedReg(MF, Reg)) {
2990         UnspilledCSGPR = Reg;
2991         UnspilledCSGPRPaired = PairedReg;
2992       }
2993       continue;
2994     }
2995 
2996     // MachO's compact unwind format relies on all registers being stored in
2997     // pairs.
2998     // FIXME: the usual format is actually better if unwinding isn't needed.
2999     if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
3000         !SavedRegs.test(PairedReg)) {
3001       SavedRegs.set(PairedReg);
3002       if (AArch64::GPR64RegClass.contains(PairedReg) &&
3003           !RegInfo->isReservedReg(MF, PairedReg))
3004         ExtraCSSpill = PairedReg;
3005     }
3006   }
3007 
3008   if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
3009       !Subtarget.isTargetWindows()) {
3010     // For Windows calling convention on a non-windows OS, where X18 is treated
3011     // as reserved, back up X18 when entering non-windows code (marked with the
3012     // Windows calling convention) and restore when returning regardless of
3013     // whether the individual function uses it - it might call other functions
3014     // that clobber it.
3015     SavedRegs.set(AArch64::X18);
3016   }
3017 
3018   // Calculates the callee saved stack size.
3019   unsigned CSStackSize = 0;
3020   unsigned SVECSStackSize = 0;
3021   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3022   const MachineRegisterInfo &MRI = MF.getRegInfo();
3023   for (unsigned Reg : SavedRegs.set_bits()) {
3024     auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
3025     if (AArch64::PPRRegClass.contains(Reg) ||
3026         AArch64::ZPRRegClass.contains(Reg))
3027       SVECSStackSize += RegSize;
3028     else
3029       CSStackSize += RegSize;
3030   }
3031 
3032   // Save number of saved regs, so we can easily update CSStackSize later.
3033   unsigned NumSavedRegs = SavedRegs.count();
3034 
3035   // The frame record needs to be created by saving the appropriate registers
3036   uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
3037   if (hasFP(MF) ||
3038       windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
3039     SavedRegs.set(AArch64::FP);
3040     SavedRegs.set(AArch64::LR);
3041   }
3042 
3043   LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
3044              for (unsigned Reg
3045                   : SavedRegs.set_bits()) dbgs()
3046              << ' ' << printReg(Reg, RegInfo);
3047              dbgs() << "\n";);
3048 
3049   // If any callee-saved registers are used, the frame cannot be eliminated.
3050   int64_t SVEStackSize =
3051       alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
3052   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
3053 
3054   // The CSR spill slots have not been allocated yet, so estimateStackSize
3055   // won't include them.
3056   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
3057 
3058   // Conservatively always assume BigStack when there are SVE spills.
3059   bool BigStack = SVEStackSize ||
3060                   (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
3061   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
3062     AFI->setHasStackFrame(true);
3063 
3064   // Estimate if we might need to scavenge a register at some point in order
3065   // to materialize a stack offset. If so, either spill one additional
3066   // callee-saved register or reserve a special spill slot to facilitate
3067   // register scavenging. If we already spilled an extra callee-saved register
3068   // above to keep the number of spills even, we don't need to do anything else
3069   // here.
3070   if (BigStack) {
3071     if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
3072       LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
3073                         << " to get a scratch register.\n");
3074       SavedRegs.set(UnspilledCSGPR);
3075       // MachO's compact unwind format relies on all registers being stored in
3076       // pairs, so if we need to spill one extra for BigStack, then we need to
3077       // store the pair.
3078       if (producePairRegisters(MF))
3079         SavedRegs.set(UnspilledCSGPRPaired);
3080       ExtraCSSpill = UnspilledCSGPR;
3081     }
3082 
3083     // If we didn't find an extra callee-saved register to spill, create
3084     // an emergency spill slot.
3085     if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
3086       const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3087       const TargetRegisterClass &RC = AArch64::GPR64RegClass;
3088       unsigned Size = TRI->getSpillSize(RC);
3089       Align Alignment = TRI->getSpillAlign(RC);
3090       int FI = MFI.CreateStackObject(Size, Alignment, false);
3091       RS->addScavengingFrameIndex(FI);
3092       LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
3093                         << " as the emergency spill slot.\n");
3094     }
3095   }
3096 
3097   // Adding the size of additional 64bit GPR saves.
3098   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
3099 
3100   // A Swift asynchronous context extends the frame record with a pointer
3101   // directly before FP.
3102   if (hasFP(MF) && AFI->hasSwiftAsyncContext())
3103     CSStackSize += 8;
3104 
3105   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
3106   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
3107                << EstimatedStackSize + AlignedCSStackSize
3108                << " bytes.\n");
3109 
3110   assert((!MFI.isCalleeSavedInfoValid() ||
3111           AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
3112          "Should not invalidate callee saved info");
3113 
3114   // Round up to register pair alignment to avoid additional SP adjustment
3115   // instructions.
3116   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
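  // If rounding up to 16 bytes added padding, remember that the callee-save
  // area has an unused 8-byte slot; this gap may be reused later (see
  // enableStackSlotScavenging() and the gap handling in
  // computeCalleeSaveRegisterPairs()).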
3117   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
3118   AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
3119 }
3120 
3121 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
3122     MachineFunction &MF, const TargetRegisterInfo *RegInfo,
3123     std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
3124     unsigned &MaxCSFrameIndex) const {
3125   bool NeedsWinCFI = needsWinCFI(MF);
3126   // To match the canonical windows frame layout, reverse the list of
3127   // callee saved registers to get them laid out by PrologEpilogInserter
3128   // in the right order. (PrologEpilogInserter allocates stack objects top
3129   // down. Windows canonical prologs store higher numbered registers at
3130   // the top, thus have the CSI array start from the highest registers.)
3131   if (NeedsWinCFI)
3132     std::reverse(CSI.begin(), CSI.end());
3133 
3134   if (CSI.empty())
3135     return true; // Early exit if no callee saved registers are modified!
3136 
3137   // Now that we know which registers need to be saved and restored, allocate
3138   // stack slots for them.
3139   MachineFrameInfo &MFI = MF.getFrameInfo();
3140   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3141 
3142   bool UsesWinAAPCS = isTargetWindows(MF);
3143   if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
3144     int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
3145     AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3146     if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3147     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3148   }
3149 
3150   for (auto &CS : CSI) {
3151     Register Reg = CS.getReg();
3152     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
3153 
3154     unsigned Size = RegInfo->getSpillSize(*RC);
3155     Align Alignment(RegInfo->getSpillAlign(*RC));
3156     int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
3157     CS.setFrameIdx(FrameIdx);
3158 
3159     if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3160     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3161 
3162     // Grab 8 bytes below FP for the extended asynchronous frame info.
3163     if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
3164         Reg == AArch64::FP) {
3165       FrameIdx = MFI.CreateStackObject(8, Alignment, true);
3166       AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3167       if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3168       if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3169     }
3170   }
3171   return true;
3172 }
3173 
3174 bool AArch64FrameLowering::enableStackSlotScavenging(
3175     const MachineFunction &MF) const {
3176   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3177   return AFI->hasCalleeSaveStackFreeSpace();
3178 }
3179 
3180 /// returns true if there are any SVE callee saves.
3181 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
3182                                       int &Min, int &Max) {
3183   Min = std::numeric_limits<int>::max();
3184   Max = std::numeric_limits<int>::min();
3185 
3186   if (!MFI.isCalleeSavedInfoValid())
3187     return false;
3188 
3189   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
3190   for (auto &CS : CSI) {
3191     if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
3192         AArch64::PPRRegClass.contains(CS.getReg())) {
3193       assert((Max == std::numeric_limits<int>::min() ||
3194               Max + 1 == CS.getFrameIdx()) &&
3195              "SVE CalleeSaves are not consecutive");
3196 
3197       Min = std::min(Min, CS.getFrameIdx());
3198       Max = std::max(Max, CS.getFrameIdx());
3199     }
3200   }
3201   return Min != std::numeric_limits<int>::max();
3202 }
3203 
3204 // Process all the SVE stack objects and determine offsets for each
3205 // object. If AssignOffsets is true, the offsets get assigned.
3206 // Fills in the first and last callee-saved frame indices into
3207 // Min/MaxCSFrameIndex, respectively.
3208 // Returns the size of the SVE stack area.
3209 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
3210                                               int &MinCSFrameIndex,
3211                                               int &MaxCSFrameIndex,
3212                                               bool AssignOffsets) {
3213 #ifndef NDEBUG
3214   // First process all fixed stack objects.
3215   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
3216     assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
3217            "SVE vectors should never be passed on the stack by value, only by "
3218            "reference.");
3219 #endif
3220 
3221   auto Assign = [&MFI](int FI, int64_t Offset) {
3222     LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
3223     MFI.setObjectOffset(FI, Offset);
3224   };
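  // Note: offsets assigned to ScalableVector stack objects are in "scalable
  // bytes"; they are scaled by the hardware vector length when the frame
  // index is eventually resolved.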
3225 
3226   int64_t Offset = 0;
3227 
3228   // Then process all callee saved slots.
3229   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
3230     // Assign offsets to the callee save slots.
3231     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3232       Offset += MFI.getObjectSize(I);
3233       Offset = alignTo(Offset, MFI.getObjectAlign(I));
3234       if (AssignOffsets)
3235         Assign(I, -Offset);
3236     }
3237   }
3238 
3239   // Ensure that the Callee-save area is aligned to 16 bytes.
3240   Offset = alignTo(Offset, Align(16U));
3241 
3242   // Create a buffer of SVE objects to allocate and sort it.
3243   SmallVector<int, 8> ObjectsToAllocate;
3244   // If we have a stack protector, and we've previously decided that we have SVE
3245   // objects on the stack and thus need it to go in the SVE stack area, then it
3246   // needs to go first.
3247   int StackProtectorFI = -1;
3248   if (MFI.hasStackProtectorIndex()) {
3249     StackProtectorFI = MFI.getStackProtectorIndex();
3250     if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
3251       ObjectsToAllocate.push_back(StackProtectorFI);
3252   }
3253   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3254     unsigned StackID = MFI.getStackID(I);
3255     if (StackID != TargetStackID::ScalableVector)
3256       continue;
3257     if (I == StackProtectorFI)
3258       continue;
3259     if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3260       continue;
3261     if (MFI.isDeadObjectIndex(I))
3262       continue;
3263 
3264     ObjectsToAllocate.push_back(I);
3265   }
3266 
3267   // Allocate all SVE locals and spills
3268   for (unsigned FI : ObjectsToAllocate) {
3269     Align Alignment = MFI.getObjectAlign(FI);
3270     // FIXME: Given that the length of SVE vectors is not necessarily a power of
3271     // two, we'd need to align every object dynamically at runtime if the
3272     // alignment is larger than 16. This is not yet supported.
3273     if (Alignment > Align(16))
3274       report_fatal_error(
3275           "Alignment of scalable vectors > 16 bytes is not yet supported");
3276 
3277     Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3278     if (AssignOffsets)
3279       Assign(FI, -Offset);
3280   }
3281 
3282   return Offset;
3283 }
3284 
3285 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3286     MachineFrameInfo &MFI) const {
3287   int MinCSFrameIndex, MaxCSFrameIndex;
3288   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
3289 }
3290 
3291 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3292     MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3293   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3294                                         true);
3295 }
3296 
3297 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3298     MachineFunction &MF, RegScavenger *RS) const {
3299   MachineFrameInfo &MFI = MF.getFrameInfo();
3300 
3301   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3302          "Upwards growing stack unsupported");
3303 
3304   int MinCSFrameIndex, MaxCSFrameIndex;
3305   int64_t SVEStackSize =
3306       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3307 
3308   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3309   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3310   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3311 
3312   // If this function isn't doing Win64-style C++ EH, we don't need to do
3313   // anything.
3314   if (!MF.hasEHFunclets())
3315     return;
3316   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3317   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3318 
3319   MachineBasicBlock &MBB = MF.front();
3320   auto MBBI = MBB.begin();
3321   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3322     ++MBBI;
3323 
3324   // Create an UnwindHelp object.
3325   // The UnwindHelp object is allocated at the start of the fixed object area
3326   int64_t FixedObject =
3327       getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3328   int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3329                                            /*SPOffset*/ -FixedObject,
3330                                            /*IsImmutable=*/false);
3331   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3332 
3333   // We need to store -2 into the UnwindHelp object at the start of the
3334   // function.
3335   DebugLoc DL;
3336   RS->enterBasicBlockEnd(MBB);
3337   RS->backward(std::prev(MBBI));
3338   Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3339   assert(DstReg && "There must be a free register after frame setup");
3340   BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3341   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3342       .addReg(DstReg, getKillRegState(true))
3343       .addFrameIndex(UnwindHelpFI)
3344       .addImm(0);
3345 }
3346 
3347 namespace {
3348 struct TagStoreInstr {
3349   MachineInstr *MI;
3350   int64_t Offset, Size;
3351   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3352       : MI(MI), Offset(Offset), Size(Size) {}
3353 };
3354 
3355 class TagStoreEdit {
3356   MachineFunction *MF;
3357   MachineBasicBlock *MBB;
3358   MachineRegisterInfo *MRI;
3359   // Tag store instructions that are being replaced.
3360   SmallVector<TagStoreInstr, 8> TagStores;
3361   // Combined memref arguments of the above instructions.
3362   SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3363 
3364   // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3365   // FrameRegOffset + Size) with the address tag of SP.
3366   Register FrameReg;
3367   StackOffset FrameRegOffset;
3368   int64_t Size;
3369   // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
3370   Optional<int64_t> FrameRegUpdate;
3371   // MIFlags for any FrameReg updating instructions.
3372   unsigned FrameRegUpdateFlags;
3373 
3374   // Use zeroing instruction variants.
3375   bool ZeroData;
3376   DebugLoc DL;
3377 
3378   void emitUnrolled(MachineBasicBlock::iterator InsertI);
3379   void emitLoop(MachineBasicBlock::iterator InsertI);
3380 
3381 public:
3382   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3383       : MBB(MBB), ZeroData(ZeroData) {
3384     MF = MBB->getParent();
3385     MRI = &MF->getRegInfo();
3386   }
3387   // Add an instruction to be replaced. Instructions must be added in the
3388   // ascending order of Offset, and have to be adjacent.
3389   void addInstruction(TagStoreInstr I) {
3390     assert((TagStores.empty() ||
3391             TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3392            "Non-adjacent tag store instructions.");
3393     TagStores.push_back(I);
3394   }
3395   void clear() { TagStores.clear(); }
3396   // Emit equivalent code at the given location, and erase the current set of
3397   // instructions. May skip if the replacement is not profitable. May invalidate
3398   // the input iterator and replace it with a valid one.
3399   void emitCode(MachineBasicBlock::iterator &InsertI,
3400                 const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
3401 };
3402 
3403 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3404   const AArch64InstrInfo *TII =
3405       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3406 
3407   const int64_t kMinOffset = -256 * 16;
3408   const int64_t kMaxOffset = 255 * 16;
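  // These bounds correspond to the signed 9-bit, 16-byte-scaled immediate of
  // STG/ST2G, i.e. a byte range of [-4096, 4080].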
3409 
3410   Register BaseReg = FrameReg;
3411   int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3412   if (BaseRegOffsetBytes < kMinOffset ||
3413       BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
3414     Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3415     emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3416                     StackOffset::getFixed(BaseRegOffsetBytes), TII);
3417     BaseReg = ScratchReg;
3418     BaseRegOffsetBytes = 0;
3419   }
3420 
3421   MachineInstr *LastI = nullptr;
3422   while (Size) {
3423     int64_t InstrSize = (Size > 16) ? 32 : 16;
3424     unsigned Opcode =
3425         InstrSize == 16
3426             ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
3427             : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
3428     MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3429                           .addReg(AArch64::SP)
3430                           .addReg(BaseReg)
3431                           .addImm(BaseRegOffsetBytes / 16)
3432                           .setMemRefs(CombinedMemRefs);
3433     // A store to [BaseReg, #0] should go last for an opportunity to fold the
3434     // final SP adjustment in the epilogue.
3435     if (BaseRegOffsetBytes == 0)
3436       LastI = I;
3437     BaseRegOffsetBytes += InstrSize;
3438     Size -= InstrSize;
3439   }
3440 
3441   if (LastI)
3442     MBB->splice(InsertI, MBB, LastI);
3443 }
3444 
3445 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3446   const AArch64InstrInfo *TII =
3447       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3448 
3449   Register BaseReg = FrameRegUpdate
3450                          ? FrameReg
3451                          : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3452   Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
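  // STGloop_wback/STZGloop_wback are pseudo instructions that are expanded
  // after frame lowering into an actual tagging loop over
  // [BaseReg, BaseReg + LoopSize); SizeReg and BaseReg are defined (and
  // clobbered) by that expansion.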
3453 
3454   emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3455 
3456   int64_t LoopSize = Size;
3457   // If the loop size is not a multiple of 32, split off one 16-byte store at
3458   // the end to fold BaseReg update into.
3459   if (FrameRegUpdate && *FrameRegUpdate)
3460     LoopSize -= LoopSize % 32;
3461   MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3462                                 TII->get(ZeroData ? AArch64::STZGloop_wback
3463                                                   : AArch64::STGloop_wback))
3464                             .addDef(SizeReg)
3465                             .addDef(BaseReg)
3466                             .addImm(LoopSize)
3467                             .addReg(BaseReg)
3468                             .setMemRefs(CombinedMemRefs);
3469   if (FrameRegUpdate)
3470     LoopI->setFlags(FrameRegUpdateFlags);
3471 
3472   int64_t ExtraBaseRegUpdate =
3473       FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3474   if (LoopSize < Size) {
3475     assert(FrameRegUpdate);
3476     assert(Size - LoopSize == 16);
3477     // Tag 16 more bytes at BaseReg and update BaseReg.
3478     BuildMI(*MBB, InsertI, DL,
3479             TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3480         .addDef(BaseReg)
3481         .addReg(BaseReg)
3482         .addReg(BaseReg)
3483         .addImm(1 + ExtraBaseRegUpdate / 16)
3484         .setMemRefs(CombinedMemRefs)
3485         .setMIFlags(FrameRegUpdateFlags);
3486   } else if (ExtraBaseRegUpdate) {
3487     // Update BaseReg.
3488     BuildMI(
3489         *MBB, InsertI, DL,
3490         TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3491         .addDef(BaseReg)
3492         .addReg(BaseReg)
3493         .addImm(std::abs(ExtraBaseRegUpdate))
3494         .addImm(0)
3495         .setMIFlags(FrameRegUpdateFlags);
3496   }
3497 }
3498 
3499 // Check if *II is a register update that can be merged into STGloop that ends
3500 // at (Reg + Size). On success, *TotalOffset is set to the full adjustment of
3501 // Reg performed by that update instruction.
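// For example, an epilogue "add sp, sp, #N" immediately following the tag
// stores can be folded when |N - Size| is a multiple of 16 and fits an
// unshifted add/sub immediate.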
3502 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3503                        int64_t Size, int64_t *TotalOffset) {
3504   MachineInstr &MI = *II;
3505   if ((MI.getOpcode() == AArch64::ADDXri ||
3506        MI.getOpcode() == AArch64::SUBXri) &&
3507       MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3508     unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3509     int64_t Offset = MI.getOperand(2).getImm() << Shift;
3510     if (MI.getOpcode() == AArch64::SUBXri)
3511       Offset = -Offset;
3512     int64_t AbsPostOffset = std::abs(Offset - Size);
3513     const int64_t kMaxOffset =
3514         0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3515     if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3516       *TotalOffset = Offset;
3517       return true;
3518     }
3519   }
3520   return false;
3521 }
3522 
3523 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3524                   SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3525   MemRefs.clear();
3526   for (auto &TS : TSE) {
3527     MachineInstr *MI = TS.MI;
3528     // An instruction without memory operands may access anything. Be
3529     // conservative and return an empty list.
3530     if (MI->memoperands_empty()) {
3531       MemRefs.clear();
3532       return;
3533     }
3534     MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3535   }
3536 }
3537 
3538 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3539                             const AArch64FrameLowering *TFI,
3540                             bool TryMergeSPUpdate) {
3541   if (TagStores.empty())
3542     return;
3543   TagStoreInstr &FirstTagStore = TagStores[0];
3544   TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3545   Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3546   DL = TagStores[0].MI->getDebugLoc();
3547 
3548   Register Reg;
3549   FrameRegOffset = TFI->resolveFrameOffsetReference(
3550       *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3551       /*PreferFP=*/false, /*ForSimm=*/true);
3552   FrameReg = Reg;
3553   FrameRegUpdate = None;
3554 
3555   mergeMemRefs(TagStores, CombinedMemRefs);
3556 
3557   LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3558              for (const auto &Instr
3559                   : TagStores) { dbgs() << "  " << *Instr.MI; });
3560 
3561   // Size threshold where a loop becomes shorter than a linear sequence of
3562   // tagging instructions.
3563   const int kSetTagLoopThreshold = 176;
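  // (176 bytes corresponds to 11 tag granules of 16 bytes.)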
3564   if (Size < kSetTagLoopThreshold) {
3565     if (TagStores.size() < 2)
3566       return;
3567     emitUnrolled(InsertI);
3568   } else {
3569     MachineInstr *UpdateInstr = nullptr;
3570     int64_t TotalOffset = 0;
3571     if (TryMergeSPUpdate) {
3572       // See if we can merge base register update into the STGloop.
3573       // This is done in AArch64LoadStoreOptimizer for "normal" stores,
3574       // but STGloop is way too unusual for that, and also it only
3575       // realistically happens in function epilogue. Also, STGloop is expanded
3576       // before that pass.
3577       if (InsertI != MBB->end() &&
3578           canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3579                             &TotalOffset)) {
3580         UpdateInstr = &*InsertI++;
3581         LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
3582                           << *UpdateInstr);
3583       }
3584     }
3585 
3586     if (!UpdateInstr && TagStores.size() < 2)
3587       return;
3588 
3589     if (UpdateInstr) {
3590       FrameRegUpdate = TotalOffset;
3591       FrameRegUpdateFlags = UpdateInstr->getFlags();
3592     }
3593     emitLoop(InsertI);
3594     if (UpdateInstr)
3595       UpdateInstr->eraseFromParent();
3596   }
3597 
3598   for (auto &TS : TagStores)
3599     TS.MI->eraseFromParent();
3600 }
3601 
3602 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3603                                         int64_t &Size, bool &ZeroData) {
3604   MachineFunction &MF = *MI.getParent()->getParent();
3605   const MachineFrameInfo &MFI = MF.getFrameInfo();
3606 
3607   unsigned Opcode = MI.getOpcode();
3608   ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3609               Opcode == AArch64::STZ2GOffset);
3610 
3611   if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3612     if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3613       return false;
3614     if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3615       return false;
3616     Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3617     Size = MI.getOperand(2).getImm();
3618     return true;
3619   }
3620 
3621   if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3622     Size = 16;
3623   else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3624     Size = 32;
3625   else
3626     return false;
3627 
3628   if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3629     return false;
3630 
3631   Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3632            16 * MI.getOperand(2).getImm();
3633   return true;
3634 }
3635 
3636 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3637 // and replace them with a shorter instruction sequence:
3638 // * replace STG + STG with ST2G
3639 // * replace STGloop + STGloop with STGloop
3640 // This code needs to run when stack slot offsets are already known, but before
3641 // FrameIndex operands in STG instructions are eliminated.
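// For example, two adjacent 16-byte STG stores that together cover 32
// contiguous bytes are collected here and re-emitted as a single ST2G by
// TagStoreEdit::emitUnrolled().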
3642 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3643                                                 const AArch64FrameLowering *TFI,
3644                                                 RegScavenger *RS) {
3645   bool FirstZeroData;
3646   int64_t Size, Offset;
3647   MachineInstr &MI = *II;
3648   MachineBasicBlock *MBB = MI.getParent();
3649   MachineBasicBlock::iterator NextI = ++II;
3650   if (&MI == &MBB->instr_back())
3651     return II;
3652   if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3653     return II;
3654 
3655   SmallVector<TagStoreInstr, 4> Instrs;
3656   Instrs.emplace_back(&MI, Offset, Size);
3657 
3658   constexpr int kScanLimit = 10;
3659   int Count = 0;
3660   for (MachineBasicBlock::iterator E = MBB->end();
3661        NextI != E && Count < kScanLimit; ++NextI) {
3662     MachineInstr &MI = *NextI;
3663     bool ZeroData;
3664     int64_t Size, Offset;
3665     // Collect instructions that update memory tags with a FrameIndex operand
3666     // and (when applicable) constant size, and whose output registers are dead
3667     // (the latter is almost always the case in practice). Since these
3668     // instructions effectively have no inputs or outputs, we are free to skip
3669     // any non-aliasing instructions in between without tracking used registers.
3670     if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3671       if (ZeroData != FirstZeroData)
3672         break;
3673       Instrs.emplace_back(&MI, Offset, Size);
3674       continue;
3675     }
3676 
3677     // Only count non-transient, non-tagging instructions toward the scan
3678     // limit.
3679     if (!MI.isTransient())
3680       ++Count;
3681 
3682     // Just in case, stop before the epilogue code starts.
3683     if (MI.getFlag(MachineInstr::FrameSetup) ||
3684         MI.getFlag(MachineInstr::FrameDestroy))
3685       break;
3686 
3687     // Reject anything that may alias the collected instructions.
3688     if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3689       break;
3690   }
3691 
3692   // New code will be inserted after the last tagging instruction we've found.
3693   MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3694   InsertI++;
3695 
3696   llvm::stable_sort(Instrs,
3697                     [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3698                       return Left.Offset < Right.Offset;
3699                     });
3700 
3701   // Make sure that we don't have any overlapping stores.
3702   int64_t CurOffset = Instrs[0].Offset;
3703   for (auto &Instr : Instrs) {
3704     if (CurOffset > Instr.Offset)
3705       return NextI;
3706     CurOffset = Instr.Offset + Instr.Size;
3707   }
3708 
3709   // Find contiguous runs of tagged memory and emit shorter instruction
3710   // sequences for them when possible.
3711   TagStoreEdit TSE(MBB, FirstZeroData);
3712   Optional<int64_t> EndOffset;
3713   for (auto &Instr : Instrs) {
3714     if (EndOffset && *EndOffset != Instr.Offset) {
3715       // Found a gap.
3716       TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
3717       TSE.clear();
3718     }
3719 
3720     TSE.addInstruction(Instr);
3721     EndOffset = Instr.Offset + Instr.Size;
3722   }
3723 
3724   // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
3725   TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
3726                !MBB->getParent()
3727                     ->getInfo<AArch64FunctionInfo>()
3728                     ->needsAsyncDwarfUnwindInfo());
3729 
3730   return InsertI;
3731 }
3732 } // namespace
3733 
3734 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3735     MachineFunction &MF, RegScavenger *RS = nullptr) const {
3736   if (StackTaggingMergeSetTag)
3737     for (auto &BB : MF)
3738       for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3739         II = tryMergeAdjacentSTG(II, this, RS);
3740 }
3741 
3742 /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3743 /// before the update.  This is easily retrieved as it is exactly the offset
3744 /// that is set in processFunctionBeforeFrameFinalized.
getFrameIndexReferencePreferSP(const MachineFunction & MF,int FI,Register & FrameReg,bool IgnoreSPUpdates) const3745 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3746     const MachineFunction &MF, int FI, Register &FrameReg,
3747     bool IgnoreSPUpdates) const {
3748   const MachineFrameInfo &MFI = MF.getFrameInfo();
3749   if (IgnoreSPUpdates) {
3750     LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3751                       << MFI.getObjectOffset(FI) << "\n");
3752     FrameReg = AArch64::SP;
3753     return StackOffset::getFixed(MFI.getObjectOffset(FI));
3754   }
3755 
3756   // Go to common code if we cannot provide sp + offset.
3757   if (MFI.hasVarSizedObjects() ||
3758       MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
3759       MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
3760     return getFrameIndexReference(MF, FI, FrameReg);
3761 
3762   FrameReg = AArch64::SP;
3763   return getStackOffset(MF, MFI.getObjectOffset(FI));
3764 }
3765 
3766 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3767 /// the parent's frame pointer
getWinEHParentFrameOffset(const MachineFunction & MF) const3768 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3769     const MachineFunction &MF) const {
3770   return 0;
3771 }
3772 
3773 /// Funclets only need to account for space for the callee saved registers,
3774 /// as the locals are accounted for in the parent's stack frame.
getWinEHFuncletFrameSize(const MachineFunction & MF) const3775 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3776     const MachineFunction &MF) const {
3777   // This is the size of the pushed CSRs.
3778   unsigned CSSize =
3779       MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3780   // This is the amount of stack a funclet needs to allocate.
3781   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3782                  getStackAlign());
3783 }
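// A worked example of the computation above (illustrative numbers only): with
// 64 bytes of pushed CSRs and a 40-byte maximum outgoing call frame, the
// funclet frame size is alignTo(64 + 40, 16) == 112 bytes, since the AArch64
// stack alignment returned by getStackAlign() is 16.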

namespace {
struct FrameObject {
  bool IsValid = false;
  // Index of the object in MFI.
  int ObjectIndex = 0;
  // Group ID this object belongs to.
  int GroupIndex = -1;
  // This object should be placed first (closest to SP).
  bool ObjectFirst = false;
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
};

class GroupBuilder {
  SmallVector<int, 8> CurrentMembers;
  int NextGroupIndex = 0;
  std::vector<FrameObject> &Objects;

public:
  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
  void AddMember(int Index) { CurrentMembers.push_back(Index); }
  void EndCurrentGroup() {
    if (CurrentMembers.size() > 1) {
      // Create a new group with the current member list. This might remove them
      // from their pre-existing groups. That's OK, dealing with overlapping
      // groups is too hard and unlikely to make a difference.
      LLVM_DEBUG(dbgs() << "group:");
      for (int Index : CurrentMembers) {
        Objects[Index].GroupIndex = NextGroupIndex;
        LLVM_DEBUG(dbgs() << " " << Index);
      }
      LLVM_DEBUG(dbgs() << "\n");
      NextGroupIndex++;
    }
    CurrentMembers.clear();
  }
};
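// A sketch of how GroupBuilder is driven (see orderFrameObjects below):
// consecutive tag-store instructions referencing valid frame indices are
// accumulated with AddMember(), and any other instruction (or the end of the
// basic block) flushes the run via EndCurrentGroup(). With hypothetical frame
// indices:
//
//   STGOffset ... fi#1     -> AddMember(1)
//   STGOffset ... fi#2     -> AddMember(2)
//   <some other MI>        -> EndCurrentGroup()  // fi#1, fi#2 form group 0
//   STGOffset ... fi#4     -> AddMember(4)
//   <end of block>         -> EndCurrentGroup()  // single member, no group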

bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  // Objects at a lower index are closer to FP; objects at a higher index are
  // closer to SP.
  //
  // For consistency in our comparison, all invalid objects are placed
  // at the end. This also allows us to stop walking when we hit the
  // first invalid item after it's all sorted.
  //
  // The "first" object goes first (closest to SP), followed by the members of
  // the "first" group.
  //
  // The rest are sorted by the group index to keep the groups together.
  // Higher numbered groups are more likely to be around longer (i.e. untagged
  // in the function epilogue and not at some earlier point). Place them closer
  // to SP.
  //
  // If all else equal, sort by the object index to keep the objects in the
  // original order.
  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
                         A.ObjectIndex) <
         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
                         B.ObjectIndex);
}
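// A small illustration of the lexicographic ordering above, with made-up
// objects: given A = {IsValid, GroupIndex=1, ObjectIndex=5},
// B = {IsValid, GroupIndex=1, ObjectIndex=2} and C = {!IsValid}, the sort
// yields B, A, C. !IsValid is the most significant key, so the invalid object
// goes last, and A and B tie on every other key, leaving ObjectIndex to keep
// them in their original frame-index order.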
} // namespace

void AArch64FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  if (!OrderFrameObjects || ObjectsToAllocate.empty())
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  for (auto &Obj : ObjectsToAllocate) {
    FrameObjects[Obj].IsValid = true;
    FrameObjects[Obj].ObjectIndex = Obj;
  }

  // Identify stack slots that are tagged at the same time.
  GroupBuilder GB(FrameObjects);
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      int OpIndex;
      switch (MI.getOpcode()) {
      case AArch64::STGloop:
      case AArch64::STZGloop:
        OpIndex = 3;
        break;
      case AArch64::STGOffset:
      case AArch64::STZGOffset:
      case AArch64::ST2GOffset:
      case AArch64::STZ2GOffset:
        OpIndex = 1;
        break;
      default:
        OpIndex = -1;
      }

      int TaggedFI = -1;
      if (OpIndex >= 0) {
        const MachineOperand &MO = MI.getOperand(OpIndex);
        if (MO.isFI()) {
          int FI = MO.getIndex();
          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
              FrameObjects[FI].IsValid)
            TaggedFI = FI;
        }
      }

      // If this is a stack tagging instruction for a slot that is not part of a
      // group yet, either start a new group or add it to the current one.
      if (TaggedFI >= 0)
        GB.AddMember(TaggedFI);
      else
        GB.EndCurrentGroup();
    }
    // Groups should never span multiple basic blocks.
    GB.EndCurrentGroup();
  }

  // If the function's tagged base pointer is pinned to a stack slot, we want to
  // put that slot first when possible. This will likely place it at SP + 0,
  // and save one instruction when generating the base pointer because IRG does
  // not allow an immediate offset.
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  if (TBPI) {
    FrameObjects[*TBPI].ObjectFirst = true;
    FrameObjects[*TBPI].GroupFirst = true;
    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
    if (FirstGroupIndex >= 0)
      for (FrameObject &Object : FrameObjects)
        if (Object.GroupIndex == FirstGroupIndex)
          Object.GroupFirst = true;
  }

  llvm::stable_sort(FrameObjects, FrameObjectCompare);

  int i = 0;
  for (auto &Obj : FrameObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }
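  // For example (hypothetical indices): if ObjectsToAllocate came in as
  // {2, 5, 7} and the sort placed the valid objects in the order 5, 7, 2, the
  // vector is rewritten in place to {5, 7, 2}. Invalid FrameObjects (slots
  // that were never in ObjectsToAllocate) sort to the end and are never
  // copied back.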

  LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
                                                    : FrameObjects) {
    if (!Obj.IsValid)
      break;
    dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
    if (Obj.ObjectFirst)
      dbgs() << ", first";
    if (Obj.GroupFirst)
      dbgs() << ", group-first";
    dbgs() << "\n";
  });
}