1 //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of TargetFrameLowering class.
10 //
11 // On AArch64, stack frames are structured as follows:
12 //
13 // The stack grows downward.
14 //
15 // All of the individual frame areas in the frame below are optional, i.e. it's
16 // possible to create a function so that a particular area isn't present
17 // in the frame.
18 //
19 // At function entry, the "frame" looks as follows:
20 //
21 // |                                   | Higher address
22 // |-----------------------------------|
23 // |                                   |
24 // | arguments passed on the stack     |
25 // |                                   |
26 // |-----------------------------------| <- sp
27 // |                                   | Lower address
28 //
29 //
30 // After the prologue has run, the frame has the following general structure.
31 // Note that this doesn't depict the case where a red-zone is used. Also,
32 // technically the last frame area (VLAs) doesn't get created until the
33 // main function body, after the prologue has run. However, it's depicted here
34 // for completeness.
35 //
36 // |                                   | Higher address
37 // |-----------------------------------|
38 // |                                   |
39 // | arguments passed on the stack     |
40 // |                                   |
41 // |-----------------------------------|
42 // |                                   |
43 // | (Win64 only) varargs from reg     |
44 // |                                   |
45 // |-----------------------------------|
46 // |                                   |
47 // | callee-saved gpr registers        | <--.
48 // |                                   |    | On Darwin platforms these
49 // |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
50 // | prev_lr                           |    | (frame record first)
51 // | prev_fp                           | <--'
52 // | async context if needed           |
53 // | (a.k.a. "frame record")           |
54 // |-----------------------------------| <- fp(=x29)
55 // |                                   |
56 // | callee-saved fp/simd/SVE regs     |
57 // |                                   |
58 // |-----------------------------------|
59 // |                                   |
60 // |        SVE stack objects          |
61 // |                                   |
62 // |-----------------------------------|
63 // |.empty.space.to.make.part.below....|
64 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
65 // |.the.standard.16-byte.alignment....|  compile time; if present)
66 // |-----------------------------------|
67 // |                                   |
68 // | local variables of fixed size     |
69 // | including spill slots             |
70 // |-----------------------------------| <- bp(not defined by ABI,
71 // |.variable-sized.local.variables....|       LLVM chooses X19)
72 // |.(VLAs)............................| (size of this area is unknown at
73 // |...................................|  compile time)
74 // |-----------------------------------| <- sp
75 // |                                   | Lower address
76 //
77 //
78 // To access data in a frame, a constant offset must be computable at compile
79 // time from one of the pointers (fp, bp, sp). The size of the areas with a
80 // dotted background cannot be computed at compile time if they are present,
81 // so all three of fp, bp and sp must be set up in order to access all
82 // contents in the frame areas, assuming all of the frame areas are
83 // non-empty.
84 //
85 // For most functions, some of the frame areas are empty. For those functions,
86 // it may not be necessary to set up fp or bp:
87 // * A base pointer is definitely needed when there are both VLAs and local
88 //   variables with more-than-default alignment requirements.
89 // * A frame pointer is definitely needed when there are local variables with
90 //   more-than-default alignment requirements.
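// For illustration only, a (hypothetical) function combining both cases, e.g.
//
//   void f(int n) {
//     alignas(32) int over_aligned[8];            // over-aligned local -> fp
//     int *vla = (int *)alloca(n * sizeof(int));  // variable-sized -> bp too
//     ...
//   }
//
// would have to set up both fp and bp.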
91 //
92 // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
93 // callee-saved area, since the unwind encoding does not allow for encoding
94 // this dynamically and existing tools depend on this layout. For other
95 // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
96 // area to allow SVE stack objects (allocated directly below the callee-saves,
97 // if available) to be accessed directly from the framepointer.
98 // The SVE spill/fill instructions have VL-scaled addressing modes such
99 // as:
100 //    ldr z8, [fp, #-7 mul vl]
101 // For SVE the size of the vector length (VL) is not known at compile-time, so
102 // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
103 // layout, we don't need to add an unscaled offset to the framepointer before
104 // accessing the SVE object in the frame.
105 //
106 // In some cases a base pointer is generated even when it is not strictly
107 // needed, namely when the offsets from the frame pointer to local variables
108 // become so large that they can't be encoded in the immediate fields of loads
109 // or stores.
110 //
111 // Outgoing function arguments must be at the bottom of the stack frame when
112 // calling another function. If we do not have variable-sized stack objects, we
113 // can allocate a "reserved call frame" area at the bottom of the local
114 // variable area, large enough for all outgoing calls. If we do have VLAs, then
115 // the stack pointer must be decremented and incremented around each call to
116 // make space for the arguments below the VLAs.
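// For illustration only (the exact sequence depends on the call site), a call
// made while VLAs are live might be bracketed like:
//
//   sub  sp, sp, #32        ; make room for this call's stack arguments
//   ...                     ; store the arguments relative to sp
//   bl   callee
//   add  sp, sp, #32        ; release the argument area again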
117 //
118 // FIXME: also explain the redzone concept.
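// Briefly, and as a rough sketch only: when the red zone is enabled (see
// canUseRedZone() below), a leaf function whose locals fit in the small area
// immediately below sp (typically 128 bytes) may keep them there without
// moving sp in the prologue at all.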
119 //
120 // An example of the prologue:
121 //
122 //     .globl __foo
123 //     .align 2
124 //  __foo:
125 // Ltmp0:
126 //     .cfi_startproc
127 //     .cfi_personality 155, ___gxx_personality_v0
128 // Leh_func_begin:
129 //     .cfi_lsda 16, Lexception33
130 //
131 //     stp  xa, xb, [sp, #-offset]!
132 //     ...
133 //     stp  x28, x27, [sp, #offset-32]
134 //     stp  fp, lr, [sp, #offset-16]
135 //     add  fp, sp, #offset - 16
136 //     sub  sp, sp, #1360
137 //
138 // The Stack:
139 //       +-------------------------------------------+
140 // 10000 | ........ | ........ | ........ | ........ |
141 // 10004 | ........ | ........ | ........ | ........ |
142 //       +-------------------------------------------+
143 // 10008 | ........ | ........ | ........ | ........ |
144 // 1000c | ........ | ........ | ........ | ........ |
145 //       +===========================================+
146 // 10010 |                X28 Register               |
147 // 10014 |                X28 Register               |
148 //       +-------------------------------------------+
149 // 10018 |                X27 Register               |
150 // 1001c |                X27 Register               |
151 //       +===========================================+
152 // 10020 |                Frame Pointer              |
153 // 10024 |                Frame Pointer              |
154 //       +-------------------------------------------+
155 // 10028 |                Link Register              |
156 // 1002c |                Link Register              |
157 //       +===========================================+
158 // 10030 | ........ | ........ | ........ | ........ |
159 // 10034 | ........ | ........ | ........ | ........ |
160 //       +-------------------------------------------+
161 // 10038 | ........ | ........ | ........ | ........ |
162 // 1003c | ........ | ........ | ........ | ........ |
163 //       +-------------------------------------------+
164 //
165 //     [sp] = 10030        ::    >>initial value<<
166 //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
167 //     fp = sp == 10020    ::  mov fp, sp
168 //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
169 //     sp == 10010         ::    >>final value<<
170 //
171 // The frame pointer (w29) points to address 10020. If we use an offset of
172 // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
173 // for w27, and -32 for w28:
174 //
175 //  Ltmp1:
176 //     .cfi_def_cfa w29, 16
177 //  Ltmp2:
178 //     .cfi_offset w30, -8
179 //  Ltmp3:
180 //     .cfi_offset w29, -16
181 //  Ltmp4:
182 //     .cfi_offset w27, -24
183 //  Ltmp5:
184 //     .cfi_offset w28, -32
185 //
186 //===----------------------------------------------------------------------===//
187 
188 #include "AArch64FrameLowering.h"
189 #include "AArch64InstrInfo.h"
190 #include "AArch64MachineFunctionInfo.h"
191 #include "AArch64RegisterInfo.h"
192 #include "AArch64Subtarget.h"
193 #include "AArch64TargetMachine.h"
194 #include "MCTargetDesc/AArch64AddressingModes.h"
195 #include "llvm/ADT/ScopeExit.h"
196 #include "llvm/ADT/SmallVector.h"
197 #include "llvm/ADT/Statistic.h"
198 #include "llvm/CodeGen/LivePhysRegs.h"
199 #include "llvm/CodeGen/MachineBasicBlock.h"
200 #include "llvm/CodeGen/MachineFrameInfo.h"
201 #include "llvm/CodeGen/MachineFunction.h"
202 #include "llvm/CodeGen/MachineInstr.h"
203 #include "llvm/CodeGen/MachineInstrBuilder.h"
204 #include "llvm/CodeGen/MachineMemOperand.h"
205 #include "llvm/CodeGen/MachineModuleInfo.h"
206 #include "llvm/CodeGen/MachineOperand.h"
207 #include "llvm/CodeGen/MachineRegisterInfo.h"
208 #include "llvm/CodeGen/RegisterScavenging.h"
209 #include "llvm/CodeGen/TargetInstrInfo.h"
210 #include "llvm/CodeGen/TargetRegisterInfo.h"
211 #include "llvm/CodeGen/TargetSubtargetInfo.h"
212 #include "llvm/CodeGen/WinEHFuncInfo.h"
213 #include "llvm/IR/Attributes.h"
214 #include "llvm/IR/CallingConv.h"
215 #include "llvm/IR/DataLayout.h"
216 #include "llvm/IR/DebugLoc.h"
217 #include "llvm/IR/Function.h"
218 #include "llvm/MC/MCAsmInfo.h"
219 #include "llvm/MC/MCDwarf.h"
220 #include "llvm/Support/CommandLine.h"
221 #include "llvm/Support/Debug.h"
222 #include "llvm/Support/ErrorHandling.h"
223 #include "llvm/Support/MathExtras.h"
224 #include "llvm/Support/raw_ostream.h"
225 #include "llvm/Target/TargetMachine.h"
226 #include "llvm/Target/TargetOptions.h"
227 #include <cassert>
228 #include <cstdint>
229 #include <iterator>
230 #include <vector>
231 
232 using namespace llvm;
233 
234 #define DEBUG_TYPE "frame-info"
235 
236 static cl::opt<bool> EnableRedZone("aarch64-redzone",
237                                    cl::desc("enable use of redzone on AArch64"),
238                                    cl::init(false), cl::Hidden);
239 
240 static cl::opt<bool>
241     ReverseCSRRestoreSeq("reverse-csr-restore-seq",
242                          cl::desc("reverse the CSR restore sequence"),
243                          cl::init(false), cl::Hidden);
244 
245 static cl::opt<bool> StackTaggingMergeSetTag(
246     "stack-tagging-merge-settag",
247     cl::desc("merge settag instruction in function epilog"), cl::init(true),
248     cl::Hidden);
249 
250 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
251                                        cl::desc("sort stack allocations"),
252                                        cl::init(true), cl::Hidden);
253 
254 cl::opt<bool> EnableHomogeneousPrologEpilog(
255     "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
256     cl::desc("Emit homogeneous prologue and epilogue for the size "
257              "optimization (default = off)"));
258 
259 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
260 
261 /// Returns how much of the incoming argument stack area (in bytes) we should
262 /// clean up in an epilogue. For the C calling convention this will be 0, for
263 /// guaranteed tail call conventions it can be positive (a normal return or a
264 /// tail call to a function that uses less stack space for arguments) or
265 /// negative (for a tail call to a function that needs more stack space than we
266 /// do for arguments).
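/// For illustration (hypothetical numbers): if our incoming argument area is
/// 32 bytes and we tail-call a function whose arguments need 48 bytes, the
/// value returned here would be -16.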
267 static int64_t getArgumentStackToRestore(MachineFunction &MF,
268                                          MachineBasicBlock &MBB) {
269   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
270   bool IsTailCallReturn = false;
271   if (MBB.end() != MBBI) {
272     unsigned RetOpcode = MBBI->getOpcode();
273     IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
274                        RetOpcode == AArch64::TCRETURNri ||
275                        RetOpcode == AArch64::TCRETURNriBTI;
276   }
277   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
278 
279   int64_t ArgumentPopSize = 0;
280   if (IsTailCallReturn) {
281     MachineOperand &StackAdjust = MBBI->getOperand(1);
282 
283     // For a tail-call in a callee-pops-arguments environment, some or all of
284     // the stack may actually be in use for the call's arguments, this is
285     // calculated during LowerCall and consumed here...
286     ArgumentPopSize = StackAdjust.getImm();
287   } else {
288     // ... otherwise the amount to pop is *all* of the argument space,
289     // conveniently stored in the MachineFunctionInfo by
290     // LowerFormalArguments. This will, of course, be zero for the C calling
291     // convention.
292     ArgumentPopSize = AFI->getArgumentStackToRestore();
293   }
294 
295   return ArgumentPopSize;
296 }
297 
298 static bool produceCompactUnwindFrame(MachineFunction &MF);
299 static bool needsWinCFI(const MachineFunction &MF);
300 static StackOffset getSVEStackSize(const MachineFunction &MF);
301 static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF);
302 
303 /// Returns true if homogeneous prolog or epilog code can be emitted
304 /// for the size optimization. If possible, a frame helper call is injected.
305 /// When an Exit block is given, the check is for the epilog.
306 bool AArch64FrameLowering::homogeneousPrologEpilog(
307     MachineFunction &MF, MachineBasicBlock *Exit) const {
308   if (!MF.getFunction().hasMinSize())
309     return false;
310   if (!EnableHomogeneousPrologEpilog)
311     return false;
312   if (ReverseCSRRestoreSeq)
313     return false;
314   if (EnableRedZone)
315     return false;
316 
317   // TODO: Windows is not supported yet.
318   if (needsWinCFI(MF))
319     return false;
320   // TODO: SVE is not supported yet.
321   if (getSVEStackSize(MF))
322     return false;
323 
324   // Bail on stack adjustment needed on return for simplicity.
325   const MachineFrameInfo &MFI = MF.getFrameInfo();
326   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
327   if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
328     return false;
329   if (Exit && getArgumentStackToRestore(MF, *Exit))
330     return false;
331 
332   return true;
333 }
334 
335 /// Returns true if CSRs should be paired.
336 bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
337   return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
338 }
339 
340 /// This is the biggest offset to the stack pointer we can encode in AArch64
341 /// instructions (without using a separate calculation and a temp register).
342 /// Note that the exceptions here are vector stores/loads, which cannot encode
343 /// any displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
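/// (255 matches the conservative unscaled-indexing assumption made in
/// estimateRSStackSizeLimit(): the signed 9-bit LDUR/STUR immediate covers
/// offsets of -256..+255 bytes.)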
344 static const unsigned DefaultSafeSPDisplacement = 255;
345 
346 /// Look at each instruction that references stack frames and return the stack
347 /// size limit beyond which some of these instructions will require a scratch
348 /// register during their expansion later.
349 static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
350   // FIXME: For now, just conservatively guesstimate based on unscaled indexing
351   // range. We'll often end up allocating an unnecessary spill slot, but
352   // realistically that's not a big deal at this stage of the game.
353   for (MachineBasicBlock &MBB : MF) {
354     for (MachineInstr &MI : MBB) {
355       if (MI.isDebugInstr() || MI.isPseudo() ||
356           MI.getOpcode() == AArch64::ADDXri ||
357           MI.getOpcode() == AArch64::ADDSXri)
358         continue;
359 
360       for (const MachineOperand &MO : MI.operands()) {
361         if (!MO.isFI())
362           continue;
363 
364         StackOffset Offset;
365         if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
366             AArch64FrameOffsetCannotUpdate)
367           return 0;
368       }
369     }
370   }
371   return DefaultSafeSPDisplacement;
372 }
373 
374 TargetStackID::Value
375 AArch64FrameLowering::getStackIDForScalableVectors() const {
376   return TargetStackID::ScalableVector;
377 }
378 
379 /// Returns the size of the fixed object area (allocated next to sp on entry).
380 /// On Win64 this may include a varargs area and an UnwindHelp object for EH.
381 static unsigned getFixedObjectSize(const MachineFunction &MF,
382                                    const AArch64FunctionInfo *AFI, bool IsWin64,
383                                    bool IsFunclet) {
384   if (!IsWin64 || IsFunclet) {
385     return AFI->getTailCallReservedStack();
386   } else {
387     if (AFI->getTailCallReservedStack() != 0)
388       report_fatal_error("cannot generate ABI-changing tail call for Win64");
389     // Var args are stored here in the primary function.
390     const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
391     // To support EH funclets we allocate an UnwindHelp object
392     const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
393     return alignTo(VarArgsArea + UnwindHelpObject, 16);
394   }
395 }
396 
397 /// Returns the size of the entire SVE stack frame (callee-saves + spills).
398 static StackOffset getSVEStackSize(const MachineFunction &MF) {
399   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
400   return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
401 }
402 
403 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
404   if (!EnableRedZone)
405     return false;
406 
407   // Don't use the red zone if the function explicitly asks us not to.
408   // This is typically used for kernel code.
409   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
410   const unsigned RedZoneSize =
411       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
412   if (!RedZoneSize)
413     return false;
414 
415   const MachineFrameInfo &MFI = MF.getFrameInfo();
416   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
417   uint64_t NumBytes = AFI->getLocalStackSize();
418 
419   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
420            getSVEStackSize(MF));
421 }
422 
423 /// hasFP - Return true if the specified function should have a dedicated frame
424 /// pointer register.
425 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
426   const MachineFrameInfo &MFI = MF.getFrameInfo();
427   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
428   // Win64 EH requires a frame pointer if funclets are present, as the locals
429   // are accessed off the frame pointer in both the parent function and the
430   // funclets.
431   if (MF.hasEHFunclets())
432     return true;
433   // Retain behavior of always omitting the FP for leaf functions when possible.
434   if (MF.getTarget().Options.DisableFramePointerElim(MF))
435     return true;
436   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
437       MFI.hasStackMap() || MFI.hasPatchPoint() ||
438       RegInfo->hasStackRealignment(MF))
439     return true;
440   // With large call frames around we may need to use the FP to access the
441   // register-scavenging emergency spill slot.
442   //
443   // Unfortunately some calls to hasFP() like machine verifier ->
444   // getReservedReg() -> hasFP in the middle of global isel are too early
445   // to know the max call frame size. Hopefully conservatively returning "true"
446   // in those cases is fine.
447   // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
448   if (!MFI.isMaxCallFrameSizeComputed() ||
449       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
450     return true;
451 
452   return false;
453 }
454 
455 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
456 /// not required, we reserve argument space for call sites immediately on entry
457 /// to the current function. This eliminates the need for add/sub sp brackets
458 /// around call sites. Returns true if the call frame is included as part of
459 /// the stack frame.
460 bool
461 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
462   return !MF.getFrameInfo().hasVarSizedObjects();
463 }
464 
465 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
466     MachineFunction &MF, MachineBasicBlock &MBB,
467     MachineBasicBlock::iterator I) const {
468   const AArch64InstrInfo *TII =
469       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
470   DebugLoc DL = I->getDebugLoc();
471   unsigned Opc = I->getOpcode();
472   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
473   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
474 
475   if (!hasReservedCallFrame(MF)) {
476     int64_t Amount = I->getOperand(0).getImm();
477     Amount = alignTo(Amount, getStackAlign());
478     if (!IsDestroy)
479       Amount = -Amount;
480 
481     // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
482     // doesn't have to pop anything), then the first operand will be zero too so
483     // this adjustment is a no-op.
484     if (CalleePopAmount == 0) {
485       // FIXME: in-function stack adjustment for calls is limited to 24-bits
486       // because there's no guaranteed temporary register available.
487       //
488       // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
489       // 1) For offset <= 12-bit, we use LSL #0
490       // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
491       // LSL #0, and the other uses LSL #12.
492       //
493       // Most call frames will be allocated at the start of a function so
494       // this is OK, but it is a limitation that needs dealing with.
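      // For illustration only (hypothetical values), a 0x12345-byte adjustment
      // could be emitted as two instructions:
      //   sub sp, sp, #0x12, lsl #12   // 0x12000 bytes
      //   sub sp, sp, #0x345           // plus the remaining 0x345 bytes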
495       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
496       emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
497                       StackOffset::getFixed(Amount), TII);
498     }
499   } else if (CalleePopAmount != 0) {
500     // If the calling convention demands that the callee pops arguments from the
501     // stack, we want to add it back if we have a reserved call frame.
502     assert(CalleePopAmount < 0xffffff && "call frame too large");
503     emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
504                     StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
505   }
506   return MBB.erase(I);
507 }
508 
509 void AArch64FrameLowering::emitCalleeSavedGPRLocations(
510     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
511   MachineFunction &MF = *MBB.getParent();
512   MachineFrameInfo &MFI = MF.getFrameInfo();
513 
514   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
515   if (CSI.empty())
516     return;
517 
518   const TargetSubtargetInfo &STI = MF.getSubtarget();
519   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
520   const TargetInstrInfo &TII = *STI.getInstrInfo();
521   DebugLoc DL = MBB.findDebugLoc(MBBI);
522 
523   for (const auto &Info : CSI) {
524     if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
525       continue;
526 
527     assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
528     unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
529 
530     int64_t Offset =
531         MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
532     unsigned CFIIndex = MF.addFrameInst(
533         MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
534     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
535         .addCFIIndex(CFIIndex)
536         .setMIFlags(MachineInstr::FrameSetup);
537   }
538 }
539 
540 void AArch64FrameLowering::emitCalleeSavedSVELocations(
541     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
542   MachineFunction &MF = *MBB.getParent();
543   MachineFrameInfo &MFI = MF.getFrameInfo();
544 
545   // Add callee saved registers to move list.
546   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
547   if (CSI.empty())
548     return;
549 
550   const TargetSubtargetInfo &STI = MF.getSubtarget();
551   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
552   const TargetInstrInfo &TII = *STI.getInstrInfo();
553   DebugLoc DL = MBB.findDebugLoc(MBBI);
554   AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
555 
556   for (const auto &Info : CSI) {
557     if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
558       continue;
559 
560     // Not all unwinders may know about SVE registers, so assume the lowest
561     // common denominator.
562     assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
563     unsigned Reg = Info.getReg();
564     if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
565       continue;
566 
567     StackOffset Offset =
568         StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
569         StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
570 
571     unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset));
572     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
573         .addCFIIndex(CFIIndex)
574         .setMIFlags(MachineInstr::FrameSetup);
575   }
576 }
577 
578 void AArch64FrameLowering::emitCalleeSavedFrameMoves(
579     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
580   emitCalleeSavedGPRLocations(MBB, MBBI);
581   emitCalleeSavedSVELocations(MBB, MBBI);
582 }
583 
584 static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF,
585                                MachineBasicBlock &MBB,
586                                MachineBasicBlock::iterator InsertPt,
587                                unsigned DwarfReg) {
588   unsigned CFIIndex =
589       MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg));
590   BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex);
591 }
592 
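/// Emit CFI directives at the start of MBB that reset the unwind state to what
/// it was on function entry: the CFA becomes sp + 0, the return-address sign
/// state is flipped back if return addresses are signed, and .cfi_same_value
/// is emitted for x18 (when the shadow call stack is used) and for every
/// callee-saved register that needs CFI.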
593 void AArch64FrameLowering::resetCFIToInitialState(
594     MachineBasicBlock &MBB) const {
595 
596   MachineFunction &MF = *MBB.getParent();
597   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
598   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
599   const auto &TRI =
600       static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
601   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
602 
603   const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION);
604   DebugLoc DL;
605 
606   // Reset the CFA to `SP + 0`.
607   MachineBasicBlock::iterator InsertPt = MBB.begin();
608   unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
609       nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0));
610   BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
611 
612   // Flip the RA sign state.
613   if (MFI.shouldSignReturnAddress()) {
614     CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
615     BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
616   }
617 
618   // Shadow call stack uses X18, reset it.
619   if (needsShadowCallStackPrologueEpilogue(MF))
620     insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
621                        TRI.getDwarfRegNum(AArch64::X18, true));
622 
623   // Emit .cfi_same_value for callee-saved registers.
624   const std::vector<CalleeSavedInfo> &CSI =
625       MF.getFrameInfo().getCalleeSavedInfo();
626   for (const auto &Info : CSI) {
627     unsigned Reg = Info.getReg();
628     if (!TRI.regNeedsCFI(Reg, Reg))
629       continue;
630     insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
631                        TRI.getDwarfRegNum(Reg, true));
632   }
633 }
634 
635 static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
636                                     MachineBasicBlock::iterator MBBI,
637                                     bool SVE) {
638   MachineFunction &MF = *MBB.getParent();
639   MachineFrameInfo &MFI = MF.getFrameInfo();
640 
641   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
642   if (CSI.empty())
643     return;
644 
645   const TargetSubtargetInfo &STI = MF.getSubtarget();
646   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
647   const TargetInstrInfo &TII = *STI.getInstrInfo();
648   DebugLoc DL = MBB.findDebugLoc(MBBI);
649 
650   for (const auto &Info : CSI) {
651     if (SVE !=
652         (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
653       continue;
654 
655     unsigned Reg = Info.getReg();
656     if (SVE &&
657         !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
658       continue;
659 
660     unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
661         nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
662     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
663         .addCFIIndex(CFIIndex)
664         .setMIFlags(MachineInstr::FrameDestroy);
665   }
666 }
667 
668 void AArch64FrameLowering::emitCalleeSavedGPRRestores(
669     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
670   emitCalleeSavedRestores(MBB, MBBI, false);
671 }
672 
673 void AArch64FrameLowering::emitCalleeSavedSVERestores(
674     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
675   emitCalleeSavedRestores(MBB, MBBI, true);
676 }
677 
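// Map a register onto the register that should actually be zeroed on its
// behalf: the 64-bit X register for a GPR, the Q register (or the Z register
// when SVE is available) for an FPR, and 0 for registers the called routine
// is expected to preserve anyway.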
678 static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
679   switch (Reg.id()) {
680   default:
681     // The called routine is expected to preserve r19-r28;
682     // r29 and r30 are used as the frame pointer and link register, respectively.
683     return 0;
684 
685     // GPRs
686 #define CASE(n)                                                                \
687   case AArch64::W##n:                                                          \
688   case AArch64::X##n:                                                          \
689     return AArch64::X##n
690   CASE(0);
691   CASE(1);
692   CASE(2);
693   CASE(3);
694   CASE(4);
695   CASE(5);
696   CASE(6);
697   CASE(7);
698   CASE(8);
699   CASE(9);
700   CASE(10);
701   CASE(11);
702   CASE(12);
703   CASE(13);
704   CASE(14);
705   CASE(15);
706   CASE(16);
707   CASE(17);
708   CASE(18);
709 #undef CASE
710 
711     // FPRs
712 #define CASE(n)                                                                \
713   case AArch64::B##n:                                                          \
714   case AArch64::H##n:                                                          \
715   case AArch64::S##n:                                                          \
716   case AArch64::D##n:                                                          \
717   case AArch64::Q##n:                                                          \
718     return HasSVE ? AArch64::Z##n : AArch64::Q##n
719   CASE(0);
720   CASE(1);
721   CASE(2);
722   CASE(3);
723   CASE(4);
724   CASE(5);
725   CASE(6);
726   CASE(7);
727   CASE(8);
728   CASE(9);
729   CASE(10);
730   CASE(11);
731   CASE(12);
732   CASE(13);
733   CASE(14);
734   CASE(15);
735   CASE(16);
736   CASE(17);
737   CASE(18);
738   CASE(19);
739   CASE(20);
740   CASE(21);
741   CASE(22);
742   CASE(23);
743   CASE(24);
744   CASE(25);
745   CASE(26);
746   CASE(27);
747   CASE(28);
748   CASE(29);
749   CASE(30);
750   CASE(31);
751 #undef CASE
752   }
753 }
754 
755 void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
756                                                 MachineBasicBlock &MBB) const {
757   // Insertion point.
758   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
759 
760   // Fake a debug loc.
761   DebugLoc DL;
762   if (MBBI != MBB.end())
763     DL = MBBI->getDebugLoc();
764 
765   const MachineFunction &MF = *MBB.getParent();
766   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
767   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
768 
769   BitVector GPRsToZero(TRI.getNumRegs());
770   BitVector FPRsToZero(TRI.getNumRegs());
771   bool HasSVE = STI.hasSVE();
772   for (MCRegister Reg : RegsToZero.set_bits()) {
773     if (TRI.isGeneralPurposeRegister(MF, Reg)) {
774       // For GPRs, we only care to clear out the 64-bit register.
775       if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
776         GPRsToZero.set(XReg);
777     } else if (AArch64::FPR128RegClass.contains(Reg) ||
778                AArch64::FPR64RegClass.contains(Reg) ||
779                AArch64::FPR32RegClass.contains(Reg) ||
780                AArch64::FPR16RegClass.contains(Reg) ||
781                AArch64::FPR8RegClass.contains(Reg)) {
782       // For FPRs, zero the full Q register (or Z register when SVE is available).
783       if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
784         FPRsToZero.set(XReg);
785     }
786   }
787 
788   const AArch64InstrInfo &TII = *STI.getInstrInfo();
789 
790   // Zero out GPRs.
791   for (MCRegister Reg : GPRsToZero.set_bits())
792     BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), Reg).addImm(0);
793 
794   // Zero out FP/vector registers.
795   for (MCRegister Reg : FPRsToZero.set_bits())
796     BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVID), Reg).addImm(0);
797 
798   if (HasSVE) {
799     for (MCRegister PReg :
800          {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
801           AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
802           AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
803           AArch64::P15}) {
804       if (RegsToZero[PReg])
805         BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
806     }
807   }
808 }
809 
810 // Find a scratch register that we can use at the start of the prologue to
811 // re-align the stack pointer.  We avoid using callee-save registers since they
812 // may appear to be free when this is called from canUseAsPrologue (during
813 // shrink wrapping), but then no longer be free when this is called from
814 // emitPrologue.
815 //
816 // FIXME: This is a bit conservative, since in the above case we could use one
817 // of the callee-save registers as a scratch temp to re-align the stack pointer,
818 // but we would then have to make sure that we were in fact saving at least one
819 // callee-save register in the prologue, which is additional complexity that
820 // doesn't seem worth the benefit.
821 static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
822   MachineFunction *MF = MBB->getParent();
823 
824   // If MBB is the entry block, use X9 as the scratch register.
825   if (&MF->front() == MBB)
826     return AArch64::X9;
827 
828   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
829   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
830   LivePhysRegs LiveRegs(TRI);
831   LiveRegs.addLiveIns(*MBB);
832 
833   // Mark callee saved registers as used so we will not choose them.
834   const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
835   for (unsigned i = 0; CSRegs[i]; ++i)
836     LiveRegs.addReg(CSRegs[i]);
837 
838   // Prefer X9 since it was historically used for the prologue scratch reg.
839   const MachineRegisterInfo &MRI = MF->getRegInfo();
840   if (LiveRegs.available(MRI, AArch64::X9))
841     return AArch64::X9;
842 
843   for (unsigned Reg : AArch64::GPR64RegClass) {
844     if (LiveRegs.available(MRI, Reg))
845       return Reg;
846   }
847   return AArch64::NoRegister;
848 }
849 
850 bool AArch64FrameLowering::canUseAsPrologue(
851     const MachineBasicBlock &MBB) const {
852   const MachineFunction *MF = MBB.getParent();
853   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
854   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
855   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
856 
857   // Don't need a scratch register if we're not going to re-align the stack.
858   if (!RegInfo->hasStackRealignment(*MF))
859     return true;
860   // Otherwise, we can use any block as long as it has a scratch register
861   // available.
862   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
863 }
864 
865 static bool windowsRequiresStackProbe(MachineFunction &MF,
866                                       uint64_t StackSizeInBytes) {
867   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
868   if (!Subtarget.isTargetWindows())
869     return false;
870   const Function &F = MF.getFunction();
871   // TODO: When implementing stack protectors, take that into account
872   // for the probe threshold.
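  // The 4096-byte default below can be overridden per function through the IR
  // attribute, e.g. (illustrative): attributes #0 = { "stack-probe-size"="8192" }.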
873   unsigned StackProbeSize = 4096;
874   if (F.hasFnAttribute("stack-probe-size"))
875     F.getFnAttribute("stack-probe-size")
876         .getValueAsString()
877         .getAsInteger(0, StackProbeSize);
878   return (StackSizeInBytes >= StackProbeSize) &&
879          !F.hasFnAttribute("no-stack-arg-probe");
880 }
881 
882 static bool needsWinCFI(const MachineFunction &MF) {
883   const Function &F = MF.getFunction();
884   return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
885          F.needsUnwindTableEntry();
886 }
887 
888 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
889     MachineFunction &MF, uint64_t StackBumpBytes) const {
890   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
891   const MachineFrameInfo &MFI = MF.getFrameInfo();
892   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
893   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
894   if (homogeneousPrologEpilog(MF))
895     return false;
896 
897   if (AFI->getLocalStackSize() == 0)
898     return false;
899 
900   // For WinCFI, if optimizing for size, prefer to not combine the stack bump
901   // (to force a stp with predecrement) to match the packed unwind format,
902   // provided that there actually are any callee saved registers to merge the
903   // decrement with.
904   // This is potentially marginally slower, but allows using the packed
905   // unwind format for functions that both have a local area and callee saved
906   // registers. Using the packed unwind format notably reduces the size of
907   // the unwind info.
908   if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
909       MF.getFunction().hasOptSize())
910     return false;
911 
912   // 512 is the maximum immediate for stp/ldp that will be used for
913   // callee-save save/restores
914   if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
915     return false;
916 
917   if (MFI.hasVarSizedObjects())
918     return false;
919 
920   if (RegInfo->hasStackRealignment(MF))
921     return false;
922 
923   // This isn't strictly necessary, but it simplifies things a bit since the
924   // current RedZone handling code assumes the SP is adjusted by the
925   // callee-save save/restore code.
926   if (canUseRedZone(MF))
927     return false;
928 
929   // When there is an SVE area on the stack, always allocate the
930   // callee-saves and spills/locals separately.
931   if (getSVEStackSize(MF))
932     return false;
933 
934   return true;
935 }
936 
937 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
938     MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
939   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
940     return false;
941 
942   if (MBB.empty())
943     return true;
944 
945   // Disable combined SP bump if the last instruction is an MTE tag store. It
946   // is almost always better to merge SP adjustment into those instructions.
947   MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
948   MachineBasicBlock::iterator Begin = MBB.begin();
949   while (LastI != Begin) {
950     --LastI;
951     if (LastI->isTransient())
952       continue;
953     if (!LastI->getFlag(MachineInstr::FrameDestroy))
954       break;
955   }
956   switch (LastI->getOpcode()) {
957   case AArch64::STGloop:
958   case AArch64::STZGloop:
959   case AArch64::STGOffset:
960   case AArch64::STZGOffset:
961   case AArch64::ST2GOffset:
962   case AArch64::STZ2GOffset:
963     return false;
964   default:
965     return true;
966   }
967   llvm_unreachable("unreachable");
968 }
969 
970 // Given a load or a store instruction, generate an appropriate unwinding SEH
971 // code on Windows.
972 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
973                                              const TargetInstrInfo &TII,
974                                              MachineInstr::MIFlag Flag) {
975   unsigned Opc = MBBI->getOpcode();
976   MachineBasicBlock *MBB = MBBI->getParent();
977   MachineFunction &MF = *MBB->getParent();
978   DebugLoc DL = MBBI->getDebugLoc();
979   unsigned ImmIdx = MBBI->getNumOperands() - 1;
980   int Imm = MBBI->getOperand(ImmIdx).getImm();
981   MachineInstrBuilder MIB;
982   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
983   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
984 
985   switch (Opc) {
986   default:
987     llvm_unreachable("No SEH Opcode for this instruction");
988   case AArch64::LDPDpost:
989     Imm = -Imm;
990     LLVM_FALLTHROUGH;
991   case AArch64::STPDpre: {
992     unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
993     unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
994     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
995               .addImm(Reg0)
996               .addImm(Reg1)
997               .addImm(Imm * 8)
998               .setMIFlag(Flag);
999     break;
1000   }
1001   case AArch64::LDPXpost:
1002     Imm = -Imm;
1003     LLVM_FALLTHROUGH;
1004   case AArch64::STPXpre: {
1005     Register Reg0 = MBBI->getOperand(1).getReg();
1006     Register Reg1 = MBBI->getOperand(2).getReg();
1007     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1008       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
1009                 .addImm(Imm * 8)
1010                 .setMIFlag(Flag);
1011     else
1012       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
1013                 .addImm(RegInfo->getSEHRegNum(Reg0))
1014                 .addImm(RegInfo->getSEHRegNum(Reg1))
1015                 .addImm(Imm * 8)
1016                 .setMIFlag(Flag);
1017     break;
1018   }
1019   case AArch64::LDRDpost:
1020     Imm = -Imm;
1021     LLVM_FALLTHROUGH;
1022   case AArch64::STRDpre: {
1023     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1024     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
1025               .addImm(Reg)
1026               .addImm(Imm)
1027               .setMIFlag(Flag);
1028     break;
1029   }
1030   case AArch64::LDRXpost:
1031     Imm = -Imm;
1032     LLVM_FALLTHROUGH;
1033   case AArch64::STRXpre: {
1034     unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1035     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
1036               .addImm(Reg)
1037               .addImm(Imm)
1038               .setMIFlag(Flag);
1039     break;
1040   }
1041   case AArch64::STPDi:
1042   case AArch64::LDPDi: {
1043     unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1044     unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1045     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
1046               .addImm(Reg0)
1047               .addImm(Reg1)
1048               .addImm(Imm * 8)
1049               .setMIFlag(Flag);
1050     break;
1051   }
1052   case AArch64::STPXi:
1053   case AArch64::LDPXi: {
1054     Register Reg0 = MBBI->getOperand(0).getReg();
1055     Register Reg1 = MBBI->getOperand(1).getReg();
1056     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1057       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
1058                 .addImm(Imm * 8)
1059                 .setMIFlag(Flag);
1060     else
1061       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
1062                 .addImm(RegInfo->getSEHRegNum(Reg0))
1063                 .addImm(RegInfo->getSEHRegNum(Reg1))
1064                 .addImm(Imm * 8)
1065                 .setMIFlag(Flag);
1066     break;
1067   }
1068   case AArch64::STRXui:
1069   case AArch64::LDRXui: {
1070     int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1071     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
1072               .addImm(Reg)
1073               .addImm(Imm * 8)
1074               .setMIFlag(Flag);
1075     break;
1076   }
1077   case AArch64::STRDui:
1078   case AArch64::LDRDui: {
1079     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1080     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
1081               .addImm(Reg)
1082               .addImm(Imm * 8)
1083               .setMIFlag(Flag);
1084     break;
1085   }
1086   }
1087   auto I = MBB->insertAfter(MBBI, MIB);
1088   return I;
1089 }
1090 
1091 // Fix up the SEH opcode associated with the save/restore instruction.
1092 static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
1093                            unsigned LocalStackSize) {
1094   MachineOperand *ImmOpnd = nullptr;
1095   unsigned ImmIdx = MBBI->getNumOperands() - 1;
1096   switch (MBBI->getOpcode()) {
1097   default:
1098     llvm_unreachable("Fix the offset in the SEH instruction");
1099   case AArch64::SEH_SaveFPLR:
1100   case AArch64::SEH_SaveRegP:
1101   case AArch64::SEH_SaveReg:
1102   case AArch64::SEH_SaveFRegP:
1103   case AArch64::SEH_SaveFReg:
1104     ImmOpnd = &MBBI->getOperand(ImmIdx);
1105     break;
1106   }
1107   if (ImmOpnd)
1108     ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
1109 }
1110 
1111 // Convert a callee-save register save/restore instruction into one that also
1112 // decrements/increments the stack pointer (allocating/deallocating the
1113 // callee-save stack area) by switching the store/load to its pre/post-increment form.
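// For illustration only (registers and sizes are hypothetical), with a 16-byte
// callee-save area the first save
//   stp x29, x30, [sp, #0]
// becomes
//   stp x29, x30, [sp, #-16]!
// which allocates the callee-save area and performs the save in one instruction.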
1114 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
1115     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
1116     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
1117     bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
1118     MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
1119     int CFAOffset = 0) {
1120   unsigned NewOpc;
1121   switch (MBBI->getOpcode()) {
1122   default:
1123     llvm_unreachable("Unexpected callee-save save/restore opcode!");
1124   case AArch64::STPXi:
1125     NewOpc = AArch64::STPXpre;
1126     break;
1127   case AArch64::STPDi:
1128     NewOpc = AArch64::STPDpre;
1129     break;
1130   case AArch64::STPQi:
1131     NewOpc = AArch64::STPQpre;
1132     break;
1133   case AArch64::STRXui:
1134     NewOpc = AArch64::STRXpre;
1135     break;
1136   case AArch64::STRDui:
1137     NewOpc = AArch64::STRDpre;
1138     break;
1139   case AArch64::STRQui:
1140     NewOpc = AArch64::STRQpre;
1141     break;
1142   case AArch64::LDPXi:
1143     NewOpc = AArch64::LDPXpost;
1144     break;
1145   case AArch64::LDPDi:
1146     NewOpc = AArch64::LDPDpost;
1147     break;
1148   case AArch64::LDPQi:
1149     NewOpc = AArch64::LDPQpost;
1150     break;
1151   case AArch64::LDRXui:
1152     NewOpc = AArch64::LDRXpost;
1153     break;
1154   case AArch64::LDRDui:
1155     NewOpc = AArch64::LDRDpost;
1156     break;
1157   case AArch64::LDRQui:
1158     NewOpc = AArch64::LDRQpost;
1159     break;
1160   }
1161   // Get rid of the SEH code associated with the old instruction.
1162   if (NeedsWinCFI) {
1163     auto SEH = std::next(MBBI);
1164     if (AArch64InstrInfo::isSEHInstruction(*SEH))
1165       SEH->eraseFromParent();
1166   }
1167 
1168   TypeSize Scale = TypeSize::Fixed(1);
1169   unsigned Width;
1170   int64_t MinOffset, MaxOffset;
1171   bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
1172       NewOpc, Scale, Width, MinOffset, MaxOffset);
1173   (void)Success;
1174   assert(Success && "unknown load/store opcode");
1175 
1176   // If the first store isn't right where we want SP then we can't fold the
1177   // update in, so create a normal arithmetic instruction instead.
1178   MachineFunction &MF = *MBB.getParent();
1179   if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
1180       CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
1181     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1182                     StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
1183                     false, false, nullptr, EmitCFI,
1184                     StackOffset::getFixed(CFAOffset));
1185 
1186     return std::prev(MBBI);
1187   }
1188 
1189   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
1190   MIB.addReg(AArch64::SP, RegState::Define);
1191 
1192   // Copy all operands other than the immediate offset.
1193   unsigned OpndIdx = 0;
1194   for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
1195        ++OpndIdx)
1196     MIB.add(MBBI->getOperand(OpndIdx));
1197 
1198   assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
1199          "Unexpected immediate offset in first/last callee-save save/restore "
1200          "instruction!");
1201   assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
1202          "Unexpected base register in callee-save save/restore instruction!");
1203   assert(CSStackSizeInc % Scale == 0);
1204   MIB.addImm(CSStackSizeInc / (int)Scale);
1205 
1206   MIB.setMIFlags(MBBI->getFlags());
1207   MIB.setMemRefs(MBBI->memoperands());
1208 
1209   // Generate a new SEH code that corresponds to the new instruction.
1210   if (NeedsWinCFI) {
1211     *HasWinCFI = true;
1212     InsertSEH(*MIB, *TII, FrameFlag);
1213   }
1214 
1215   if (EmitCFI) {
1216     unsigned CFIIndex = MF.addFrameInst(
1217         MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc));
1218     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1219         .addCFIIndex(CFIIndex)
1220         .setMIFlags(FrameFlag);
1221   }
1222 
1223   return std::prev(MBB.erase(MBBI));
1224 }
1225 
1226 // Fix up callee-save register save/restore instructions to take a combined SP
1227 // bump into account by adding the local stack size to their stack offsets.
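// For illustration only (hypothetical numbers): if the local area is 32 bytes
// and was allocated together with the callee saves by a combined SP bump, a
// save that was
//   stp x29, x30, [sp, #16]
// is rewritten to
//   stp x29, x30, [sp, #48]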
1228 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
1229                                               uint64_t LocalStackSize,
1230                                               bool NeedsWinCFI,
1231                                               bool *HasWinCFI) {
1232   if (AArch64InstrInfo::isSEHInstruction(MI))
1233     return;
1234 
1235   unsigned Opc = MI.getOpcode();
1236   unsigned Scale;
1237   switch (Opc) {
1238   case AArch64::STPXi:
1239   case AArch64::STRXui:
1240   case AArch64::STPDi:
1241   case AArch64::STRDui:
1242   case AArch64::LDPXi:
1243   case AArch64::LDRXui:
1244   case AArch64::LDPDi:
1245   case AArch64::LDRDui:
1246     Scale = 8;
1247     break;
1248   case AArch64::STPQi:
1249   case AArch64::STRQui:
1250   case AArch64::LDPQi:
1251   case AArch64::LDRQui:
1252     Scale = 16;
1253     break;
1254   default:
1255     llvm_unreachable("Unexpected callee-save save/restore opcode!");
1256   }
1257 
1258   unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1259   assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1260          "Unexpected base register in callee-save save/restore instruction!");
1261   // Last operand is immediate offset that needs fixing.
1262   MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1263   // All generated opcodes have scaled offsets.
1264   assert(LocalStackSize % Scale == 0);
1265   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1266 
1267   if (NeedsWinCFI) {
1268     *HasWinCFI = true;
1269     auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1270     assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1271     assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1272            "Expecting a SEH instruction");
1273     fixupSEHOpcode(MBBI, LocalStackSize);
1274   }
1275 }
1276 
1277 static bool isTargetWindows(const MachineFunction &MF) {
1278   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1279 }
1280 
1281 // Convenience function to determine whether I is an SVE callee save.
1282 static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1283   switch (I->getOpcode()) {
1284   default:
1285     return false;
1286   case AArch64::STR_ZXI:
1287   case AArch64::STR_PXI:
1288   case AArch64::LDR_ZXI:
1289   case AArch64::LDR_PXI:
1290     return I->getFlag(MachineInstr::FrameSetup) ||
1291            I->getFlag(MachineInstr::FrameDestroy);
1292   }
1293 }
1294 
1295 static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) {
1296   if (!(llvm::any_of(
1297             MF.getFrameInfo().getCalleeSavedInfo(),
1298             [](const auto &Info) { return Info.getReg() == AArch64::LR; }) &&
1299         MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
1300     return false;
1301 
1302   if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
1303     report_fatal_error("Must reserve x18 to use shadow call stack");
1304 
1305   return true;
1306 }
1307 
1308 static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
1309                                         MachineFunction &MF,
1310                                         MachineBasicBlock &MBB,
1311                                         MachineBasicBlock::iterator MBBI,
1312                                         const DebugLoc &DL, bool NeedsWinCFI,
1313                                         bool NeedsUnwindInfo) {
1314   // Shadow call stack prolog: str x30, [x18], #8
1315   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
1316       .addReg(AArch64::X18, RegState::Define)
1317       .addReg(AArch64::LR)
1318       .addReg(AArch64::X18)
1319       .addImm(8)
1320       .setMIFlag(MachineInstr::FrameSetup);
1321 
1322   // This instruction also makes x18 live-in to the entry block.
1323   MBB.addLiveIn(AArch64::X18);
1324 
1325   if (NeedsWinCFI)
1326     BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
1327         .setMIFlag(MachineInstr::FrameSetup);
1328 
1329   if (NeedsUnwindInfo) {
1330     // Emit a CFI instruction that causes 8 to be subtracted from the value of
1331     // x18 when unwinding past this frame.
1332     static const char CFIInst[] = {
1333         dwarf::DW_CFA_val_expression,
1334         18, // register
1335         2,  // length
1336         static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
1337         static_cast<char>(-8) & 0x7f, // addend (sleb128)
1338     };
1339     unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
1340         nullptr, StringRef(CFIInst, sizeof(CFIInst))));
1341     BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
1342         .addCFIIndex(CFIIndex)
1343         .setMIFlag(MachineInstr::FrameSetup);
1344   }
1345 }
1346 
1347 static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
1348                                         MachineFunction &MF,
1349                                         MachineBasicBlock &MBB,
1350                                         MachineBasicBlock::iterator MBBI,
1351                                         const DebugLoc &DL) {
1352   // Shadow call stack epilog: ldr x30, [x18, #-8]!
1353   BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
1354       .addReg(AArch64::X18, RegState::Define)
1355       .addReg(AArch64::LR, RegState::Define)
1356       .addReg(AArch64::X18)
1357       .addImm(-8)
1358       .setMIFlag(MachineInstr::FrameDestroy);
1359 
1360   if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) {
1361     unsigned CFIIndex =
1362         MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
1363     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
1364         .addCFIIndex(CFIIndex)
1365         .setMIFlags(MachineInstr::FrameDestroy);
1366   }
1367 }
1368 
1369 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1370                                         MachineBasicBlock &MBB) const {
1371   MachineBasicBlock::iterator MBBI = MBB.begin();
1372   const MachineFrameInfo &MFI = MF.getFrameInfo();
1373   const Function &F = MF.getFunction();
1374   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1375   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1376   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1377   MachineModuleInfo &MMI = MF.getMMI();
1378   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1379   bool EmitCFI = AFI->needsDwarfUnwindInfo();
1380   bool HasFP = hasFP(MF);
1381   bool NeedsWinCFI = needsWinCFI(MF);
1382   bool HasWinCFI = false;
1383   auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1384 
1385   bool IsFunclet = MBB.isEHFuncletEntry();
1386 
1387   // At this point, we're going to decide whether or not the function uses a
1388   // redzone. In most cases, the function doesn't have a redzone so let's
1389   // assume that's false and set it to true in the case that there's a redzone.
1390   AFI->setHasRedZone(false);
1391 
1392   // Debug location must be unknown since the first debug location is used
1393   // to determine the end of the prologue.
1394   DebugLoc DL;
1395 
1396   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1397   if (needsShadowCallStackPrologueEpilogue(MF))
1398     emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
1399                                 MFnI.needsDwarfUnwindInfo());
1400 
1401   if (MFnI.shouldSignReturnAddress()) {
1402     unsigned PACI;
1403     if (MFnI.shouldSignWithBKey()) {
1404       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1405           .setMIFlag(MachineInstr::FrameSetup);
1406       PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
1407     } else {
1408       PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
1409     }
1410 
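    // With pointer authentication available, the explicit-operand forms
    // (PACIA/PACIB) selected above need LR and SP added as operands; the
    // hint-space forms (PACIASP/PACIBSP) used otherwise encode them
    // implicitly and execute as NOPs on cores without PAuth.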
1411     auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
1412     if (Subtarget.hasPAuth())
1413       MI.addReg(AArch64::LR, RegState::Define)
1414           .addReg(AArch64::LR)
1415           .addReg(AArch64::SP, RegState::InternalRead);
1416     MI.setMIFlag(MachineInstr::FrameSetup);
1417     if (EmitCFI) {
1418       unsigned CFIIndex =
1419           MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1420       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1421           .addCFIIndex(CFIIndex)
1422           .setMIFlags(MachineInstr::FrameSetup);
1423     }
1424   }
1425 
1426   // We signal the presence of a Swift extended frame to external tools by
  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
  // simple ORR is sufficient; a Swift kernel is assumed to initialize the
  // TBI bits so that this remains true.
1430   if (HasFP && AFI->hasSwiftAsyncContext()) {
1431     switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1432     case SwiftAsyncFramePointerMode::DeploymentBased:
1433       if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
1434         // The special symbol below is absolute and has a *value* that can be
1435         // combined with the frame pointer to signal an extended frame.
1436         BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
1437             .addExternalSymbol("swift_async_extendedFramePointerFlags",
1438                                AArch64II::MO_GOT);
1439         BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
1440             .addUse(AArch64::FP)
1441             .addUse(AArch64::X16)
1442             .addImm(Subtarget.isTargetILP32() ? 32 : 0);
1443         break;
1444       }
1445       LLVM_FALLTHROUGH;
1446 
1447     case SwiftAsyncFramePointerMode::Always:
1448       // ORR x29, x29, #0x1000_0000_0000_0000
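      // (0x1100 is the encoded logical-immediate form of that constant,
      // i.e. a single set bit at position 60.)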
1449       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1450           .addUse(AArch64::FP)
1451           .addImm(0x1100)
1452           .setMIFlag(MachineInstr::FrameSetup);
1453       break;
1454 
1455     case SwiftAsyncFramePointerMode::Never:
1456       break;
1457     }
1458   }
1459 
1460   // All calls are tail calls in GHC calling conv, and functions have no
1461   // prologue/epilogue.
1462   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1463     return;
1464 
1465   // Set tagged base pointer to the requested stack slot.
1466   // Ideally it should match SP value after prologue.
1467   Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1468   if (TBPI)
1469     AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1470   else
1471     AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1472 
1473   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1474 
1475   // getStackSize() includes all the locals in its size calculation. We don't
1476   // include these locals when computing the stack size of a funclet, as they
1477   // are allocated in the parent's stack frame and accessed via the frame
1478   // pointer from the funclet.  We only save the callee saved registers in the
1479   // funclet, which are really the callee saved registers of the parent
1480   // function, including the funclet.
1481   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1482                                : MFI.getStackSize();
1483   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1484     assert(!HasFP && "unexpected function without stack frame but with FP");
1485     assert(!SVEStackSize &&
1486            "unexpected function without stack frame but with SVE objects");
1487     // All of the stack allocation is for locals.
1488     AFI->setLocalStackSize(NumBytes);
1489     if (!NumBytes)
1490       return;
1491     // REDZONE: If the stack size is less than 128 bytes, we don't need
1492     // to actually allocate.
1493     if (canUseRedZone(MF)) {
1494       AFI->setHasRedZone(true);
1495       ++NumRedZoneFunctions;
1496     } else {
1497       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1498                       StackOffset::getFixed(-NumBytes), TII,
1499                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1500       if (EmitCFI) {
1501         // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1502         MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
1504         unsigned CFIIndex = MF.addFrameInst(
1505             MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1506         BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1507             .addCFIIndex(CFIIndex)
1508             .setMIFlags(MachineInstr::FrameSetup);
1509       }
1510     }
1511 
1512     if (NeedsWinCFI) {
1513       HasWinCFI = true;
1514       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1515           .setMIFlag(MachineInstr::FrameSetup);
1516     }
1517 
1518     return;
1519   }
1520 
1521   bool IsWin64 =
1522       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1523   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1524 
1525   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1526   // All of the remaining stack allocations are for locals.
1527   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1528   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1529   bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1530   if (CombineSPBump) {
1531     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1532     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1533                     StackOffset::getFixed(-NumBytes), TII,
1534                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
1535                     EmitCFI);
1536     NumBytes = 0;
1537   } else if (HomPrologEpilog) {
1538     // Stack has been already adjusted.
1539     NumBytes -= PrologueSaveSize;
1540   } else if (PrologueSaveSize != 0) {
1541     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1542         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
1543         EmitCFI);
1544     NumBytes -= PrologueSaveSize;
1545   }
1546   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1547 
1548   // Move past the saves of the callee-saved registers, fixing up the offsets
1549   // and pre-inc if we decided to combine the callee-save and local stack
1550   // pointer bump above.
1551   MachineBasicBlock::iterator End = MBB.end();
1552   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1553          !IsSVECalleeSave(MBBI)) {
1554     if (CombineSPBump)
1555       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1556                                         NeedsWinCFI, &HasWinCFI);
1557     ++MBBI;
1558   }
1559 
1560   // For funclets the FP belongs to the containing function.
1561   if (!IsFunclet && HasFP) {
1562     // Only set up FP if we actually need to.
1563     int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1564 
1565     if (CombineSPBump)
1566       FPOffset += AFI->getLocalStackSize();
1567 
1568     if (AFI->hasSwiftAsyncContext()) {
1569       // Before we update the live FP we have to ensure there's a valid (or
1570       // null) asynchronous context in its slot just before FP in the frame
1571       // record, so store it now.
1572       const auto &Attrs = MF.getFunction().getAttributes();
1573       bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1574       if (HaveInitialContext)
1575         MBB.addLiveIn(AArch64::X22);
1576       BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1577           .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
1578           .addUse(AArch64::SP)
1579           .addImm(FPOffset - 8)
1580           .setMIFlags(MachineInstr::FrameSetup);
1581     }
1582 
1583     if (HomPrologEpilog) {
1584       auto Prolog = MBBI;
1585       --Prolog;
1586       assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1587       Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1588     } else {
1589       // Issue    sub fp, sp, FPOffset or
1590       //          mov fp,sp          when FPOffset is zero.
1591       // Note: All stores of callee-saved registers are marked as "FrameSetup".
1592       // This code marks the instruction(s) that set the FP also.
1593       emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1594                       StackOffset::getFixed(FPOffset), TII,
1595                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1596     }
1597     if (EmitCFI) {
1598       // Define the current CFA rule to use the provided FP.
1599       const int OffsetToFirstCalleeSaveFromFP =
1600           AFI->getCalleeSaveBaseToFrameRecordOffset() -
1601           AFI->getCalleeSavedStackSize();
1602       Register FramePtr = RegInfo->getFrameRegister(MF);
1603       unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1604       unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
1605           nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1606       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1607           .addCFIIndex(CFIIndex)
1608           .setMIFlags(MachineInstr::FrameSetup);
1609     }
1610   }
1611 
1612   // Now emit the moves for whatever callee saved regs we have (including FP,
  // LR if those are saved). Frame instructions for SVE registers are emitted
  // later, after the instructions that actually save the SVE regs.
1615   if (EmitCFI)
1616     emitCalleeSavedGPRLocations(MBB, MBBI);
1617 
1618   if (windowsRequiresStackProbe(MF, NumBytes)) {
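    // __chkstk expects the allocation size in x15, as a count of 16-byte
    // units.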
1619     uint64_t NumWords = NumBytes >> 4;
1620     if (NeedsWinCFI) {
1621       HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount.  We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");
1630 
1631       uint32_t LowNumWords = NumWords & 0xFFFF;
1632       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
          .addImm(LowNumWords)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
      if ((NumWords & 0xFFFF0000) != 0) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
            .addReg(AArch64::X15)
            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
1646       }
1647     } else {
1648       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1649           .addImm(NumWords)
1650           .setMIFlags(MachineInstr::FrameSetup);
1651     }
1652 
1653     switch (MF.getTarget().getCodeModel()) {
1654     case CodeModel::Tiny:
1655     case CodeModel::Small:
1656     case CodeModel::Medium:
1657     case CodeModel::Kernel:
1658       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1659           .addExternalSymbol("__chkstk")
1660           .addReg(AArch64::X15, RegState::Implicit)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
1664           .setMIFlags(MachineInstr::FrameSetup);
1665       if (NeedsWinCFI) {
1666         HasWinCFI = true;
1667         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1668             .setMIFlag(MachineInstr::FrameSetup);
1669       }
1670       break;
1671     case CodeModel::Large:
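      // Note: the MOVaddrEXT pseudo takes the target symbol twice; it is
      // later expanded into the multi-instruction sequence that materializes
      // the address of __chkstk in x16.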
1672       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1673           .addReg(AArch64::X16, RegState::Define)
1674           .addExternalSymbol("__chkstk")
1675           .addExternalSymbol("__chkstk")
1676           .setMIFlags(MachineInstr::FrameSetup);
1677       if (NeedsWinCFI) {
1678         HasWinCFI = true;
1679         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1680             .setMIFlag(MachineInstr::FrameSetup);
1681       }
1682 
1683       BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1684           .addReg(AArch64::X16, RegState::Kill)
1685           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
1689           .setMIFlags(MachineInstr::FrameSetup);
1690       if (NeedsWinCFI) {
1691         HasWinCFI = true;
1692         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1693             .setMIFlag(MachineInstr::FrameSetup);
1694       }
1695       break;
1696     }
1697 
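    // Apply the probed allocation: sp -= x15 * 16. The UXTX #4 extend scales
    // the 16-byte unit count in x15 back to bytes.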
1698     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1699         .addReg(AArch64::SP, RegState::Kill)
1700         .addReg(AArch64::X15, RegState::Kill)
1701         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1702         .setMIFlags(MachineInstr::FrameSetup);
1703     if (NeedsWinCFI) {
1704       HasWinCFI = true;
1705       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1706           .addImm(NumBytes)
1707           .setMIFlag(MachineInstr::FrameSetup);
1708     }
1709     NumBytes = 0;
1710   }
1711 
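  // Split the SVE allocation: the part holding the SVE callee-saves is
  // allocated before their spill instructions, the remainder afterwards.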
1712   StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1713   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1714 
1715   // Process the SVE callee-saves to determine what space needs to be
1716   // allocated.
1717   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1718     // Find callee save instructions in frame.
1719     CalleeSavesBegin = MBBI;
1720     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1721     while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1722       ++MBBI;
1723     CalleeSavesEnd = MBBI;
1724 
1725     AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1726     AllocateAfter = SVEStackSize - AllocateBefore;
1727   }
1728 
1729   // Allocate space for the callee saves (if any).
1730   emitFrameOffset(
1731       MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII,
1732       MachineInstr::FrameSetup, false, false, nullptr,
1733       EmitCFI && !HasFP && AllocateBefore,
1734       StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
1735 
1736   if (EmitCFI)
1737     emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
1738 
1739   // Finally allocate remaining SVE stack space.
1740   emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1741                   -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
1742                   nullptr, EmitCFI && !HasFP && AllocateAfter,
1743                   AllocateBefore + StackOffset::getFixed(
1744                                        (int64_t)MFI.getStackSize() - NumBytes));
1745 
1746   // Allocate space for the rest of the frame.
1747   if (NumBytes) {
    // Alignment is required for the parent frame, not the funclet.
1749     const bool NeedsRealignment =
1750         !IsFunclet && RegInfo->hasStackRealignment(MF);
1751     unsigned scratchSPReg = AArch64::SP;
1752 
1753     if (NeedsRealignment) {
1754       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1755       assert(scratchSPReg != AArch64::NoRegister);
1756     }
1757 
1758     // If we're a leaf function, try using the red zone.
1759     if (!canUseRedZone(MF)) {
1760       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1761       // the correct value here, as NumBytes also includes padding bytes,
1762       // which shouldn't be counted here.
1763       emitFrameOffset(
1764           MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1765           StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup,
1766           false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
1767           SVEStackSize +
1768               StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
1769     }
1770     if (NeedsRealignment) {
1771       const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1772       assert(NrBitsToZero > 1);
1773       assert(scratchSPReg != AArch64::SP);
1774 
1775       // SUB X9, SP, NumBytes
      //   -- X9 is a temporary register, so it shouldn't contain any live
      //   -- data here and is free to use. This is already produced by the
      //   -- emitFrameOffset call above.
1778       // AND SP, X9, 0b11111...0000
1779       // The logical immediates have a non-trivial encoding. The following
1780       // formula computes the encoded immediate with all ones but
1781       // NrBitsToZero zero bits as least significant bits.
1782       uint32_t andMaskEncoded = (1 << 12)                         // = N
1783                                 | ((64 - NrBitsToZero) << 6)      // immr
1784                                 | ((64 - NrBitsToZero - 1) << 0); // imms
1785 
1786       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1787           .addReg(scratchSPReg, RegState::Kill)
1788           .addImm(andMaskEncoded);
1789       AFI->setStackRealigned(true);
1790       if (NeedsWinCFI) {
1791         HasWinCFI = true;
1792         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1793             .addImm(NumBytes & andMaskEncoded)
1794             .setMIFlag(MachineInstr::FrameSetup);
1795       }
1796     }
1797   }
1798 
1799   // If we need a base pointer, set it up here. It's whatever the value of the
1800   // stack pointer is at this point. Any variable size objects will be allocated
1801   // after this, so we can still use the base pointer to reference locals.
1802   //
1803   // FIXME: Clarify FrameSetup flags here.
1804   // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1805   // needed.
1806   // For funclets the BP belongs to the containing function.
1807   if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1808     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1809                      false);
1810     if (NeedsWinCFI) {
1811       HasWinCFI = true;
1812       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1813           .setMIFlag(MachineInstr::FrameSetup);
1814     }
1815   }
1816 
1817   // The very last FrameSetup instruction indicates the end of prologue. Emit a
1818   // SEH opcode indicating the prologue end.
1819   if (NeedsWinCFI && HasWinCFI) {
1820     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1821         .setMIFlag(MachineInstr::FrameSetup);
1822   }
1823 
1824   // SEH funclets are passed the frame pointer in X1.  If the parent
1825   // function uses the base register, then the base register is used
1826   // directly, and is not retrieved from X1.
1827   if (IsFunclet && F.hasPersonalityFn()) {
1828     EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1829     if (isAsynchronousEHPersonality(Per)) {
1830       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1831           .addReg(AArch64::X1)
1832           .setMIFlag(MachineInstr::FrameSetup);
1833       MBB.addLiveIn(AArch64::X1);
1834     }
1835   }
1836 }
1837 
1838 static void InsertReturnAddressAuth(MachineFunction &MF,
1839                                     MachineBasicBlock &MBB) {
1840   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1841   if (!MFI.shouldSignReturnAddress())
1842     return;
1843   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1844   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1845 
1846   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1847   DebugLoc DL;
1848   if (MBBI != MBB.end())
1849     DL = MBBI->getDebugLoc();
1850 
  // The AUTIASP instruction assembles to a hint instruction before v8.3a so
  // this instruction can safely be used for any v8-A architecture.
1853   // From v8.3a onwards there are optimised authenticate LR and return
1854   // instructions, namely RETA{A,B}, that can be used instead. In this case the
1855   // DW_CFA_AARCH64_negate_ra_state can't be emitted.
1856   if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1857       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1858     BuildMI(MBB, MBBI, DL,
1859             TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1860         .copyImplicitOps(*MBBI);
1861     MBB.erase(MBBI);
1862   } else {
1863     BuildMI(
1864         MBB, MBBI, DL,
1865         TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1866         .setMIFlag(MachineInstr::FrameDestroy);
1867 
1868     unsigned CFIIndex =
1869         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1870     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1871         .addCFIIndex(CFIIndex)
1872         .setMIFlags(MachineInstr::FrameDestroy);
1873   }
1874 }
1875 
1876 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1877   switch (MI.getOpcode()) {
1878   default:
1879     return false;
1880   case AArch64::CATCHRET:
1881   case AArch64::CLEANUPRET:
1882     return true;
1883   }
1884 }
1885 
1886 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1887                                         MachineBasicBlock &MBB) const {
1888   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1889   MachineFrameInfo &MFI = MF.getFrameInfo();
1890   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1891   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1892   DebugLoc DL;
1893   bool NeedsWinCFI = needsWinCFI(MF);
1894   bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo();
1895   bool HasWinCFI = false;
1896   bool IsFunclet = false;
1897   auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1898 
1899   if (MBB.end() != MBBI) {
1900     DL = MBBI->getDebugLoc();
1901     IsFunclet = isFuncletReturnInstr(*MBBI);
1902   }
1903 
1904   auto FinishingTouches = make_scope_exit([&]() {
1905     InsertReturnAddressAuth(MF, MBB);
1906     if (needsShadowCallStackPrologueEpilogue(MF))
1907       emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
1908     if (EmitCFI)
1909       emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
1910   });
1911 
1912   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1913                                : MFI.getStackSize();
1914   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1915 
1916   // All calls are tail calls in GHC calling conv, and functions have no
1917   // prologue/epilogue.
1918   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1919     return;
1920 
1921   // How much of the stack used by incoming arguments this function is expected
1922   // to restore in this particular epilogue.
1923   int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1924   bool IsWin64 =
1925       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1926   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1927 
1928   int64_t AfterCSRPopSize = ArgumentStackToRestore;
1929   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1930   // We cannot rely on the local stack size set in emitPrologue if the function
1931   // has funclets, as funclets have different local stack size requirements, and
1932   // the current value set in emitPrologue may be that of the containing
1933   // function.
1934   if (MF.hasEHFunclets())
1935     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1936   if (homogeneousPrologEpilog(MF, &MBB)) {
1937     assert(!NeedsWinCFI);
1938     auto LastPopI = MBB.getFirstTerminator();
1939     if (LastPopI != MBB.begin()) {
1940       auto HomogeneousEpilog = std::prev(LastPopI);
1941       if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1942         LastPopI = HomogeneousEpilog;
1943     }
1944 
1945     // Adjust local stack
1946     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1947                     StackOffset::getFixed(AFI->getLocalStackSize()), TII,
1948                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1949 
    // SP has already been adjusted while restoring callee save regs.
    // We've already bailed out of the case that adjusts SP for arguments.
1952     assert(AfterCSRPopSize == 0);
1953     return;
1954   }
1955   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
  // Assume we can't combine the last pop with the sp restore.
  bool CombineAfterCSRBump = false;
1959   if (!CombineSPBump && PrologueSaveSize != 0) {
1960     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1961     while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
1962            AArch64InstrInfo::isSEHInstruction(*Pop))
1963       Pop = std::prev(Pop);
1964     // Converting the last ldp to a post-index ldp is valid only if the last
1965     // ldp's offset is 0.
1966     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1967     // If the offset is 0 and the AfterCSR pop is not actually trying to
1968     // allocate more stack for arguments (in space that an untimely interrupt
1969     // may clobber), convert it to a post-index ldp.
1970     if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
1971       convertCalleeSaveRestoreToSPPrePostIncDec(
1972           MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
1973           MachineInstr::FrameDestroy, PrologueSaveSize);
1974     } else {
1975       // If not, make sure to emit an add after the last ldp.
      // We're doing this by transferring the size to be restored from the
1977       // adjustment *before* the CSR pops to the adjustment *after* the CSR
1978       // pops.
1979       AfterCSRPopSize += PrologueSaveSize;
1980       CombineAfterCSRBump = true;
1981     }
1982   }
1983 
1984   // Move past the restores of the callee-saved registers.
1985   // If we plan on combining the sp bump of the local stack size and the callee
1986   // save stack size, we might need to adjust the CSR save and restore offsets.
1987   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1988   MachineBasicBlock::iterator Begin = MBB.begin();
1989   while (LastPopI != Begin) {
1990     --LastPopI;
1991     if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1992         IsSVECalleeSave(LastPopI)) {
1993       ++LastPopI;
1994       break;
1995     } else if (CombineSPBump)
1996       fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1997                                         NeedsWinCFI, &HasWinCFI);
1998   }
1999 
2000   if (MF.hasWinCFI()) {
2001     // If the prologue didn't contain any SEH opcodes and didn't set the
2002     // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
2003     // EpilogStart - to avoid generating CFI for functions that don't need it.
2004     // (And as we didn't generate any prologue at all, it would be asymmetrical
2005     // to the epilogue.) By the end of the function, we assert that
2006     // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
2007     HasWinCFI = true;
2008     BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
2009         .setMIFlag(MachineInstr::FrameDestroy);
2010   }
2011 
2012   if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2013     switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
2014     case SwiftAsyncFramePointerMode::DeploymentBased:
2015       // Avoid the reload as it is GOT relative, and instead fall back to the
2016       // hardcoded value below.  This allows a mismatch between the OS and
2017       // application without immediately terminating on the difference.
2018       LLVM_FALLTHROUGH;
2019     case SwiftAsyncFramePointerMode::Always:
2020       // We need to reset FP to its untagged state on return. Bit 60 is
2021       // currently used to show the presence of an extended frame.
2022 
2023       // BIC x29, x29, #0x1000_0000_0000_0000
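      // (0x10fe is the encoded logical-immediate form of the complement,
      // i.e. all bits set except bit 60.)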
2024       BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
2025               AArch64::FP)
2026           .addUse(AArch64::FP)
2027           .addImm(0x10fe)
2028           .setMIFlag(MachineInstr::FrameDestroy);
2029       break;
2030 
2031     case SwiftAsyncFramePointerMode::Never:
2032       break;
2033     }
2034   }
2035 
2036   const StackOffset &SVEStackSize = getSVEStackSize(MF);
2037 
2038   // If there is a single SP update, insert it before the ret and we're done.
2039   if (CombineSPBump) {
2040     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
2041 
2042     // When we are about to restore the CSRs, the CFA register is SP again.
2043     if (EmitCFI && hasFP(MF)) {
2044       const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2045       unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2046       unsigned CFIIndex =
2047           MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
2048       BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2049           .addCFIIndex(CFIIndex)
2050           .setMIFlags(MachineInstr::FrameDestroy);
2051     }
2052 
2053     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2054                     StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
2055                     TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
2056                     &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
2057     if (HasWinCFI)
2058       BuildMI(MBB, MBB.getFirstTerminator(), DL,
2059               TII->get(AArch64::SEH_EpilogEnd))
2060           .setMIFlag(MachineInstr::FrameDestroy);
2061     return;
2062   }
2063 
2064   NumBytes -= PrologueSaveSize;
2065   assert(NumBytes >= 0 && "Negative stack allocation size!?");
2066 
2067   // Process the SVE callee-saves to determine what space needs to be
2068   // deallocated.
2069   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
2070   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
2071   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2072     RestoreBegin = std::prev(RestoreEnd);
2073     while (RestoreBegin != MBB.begin() &&
2074            IsSVECalleeSave(std::prev(RestoreBegin)))
2075       --RestoreBegin;
2076 
2077     assert(IsSVECalleeSave(RestoreBegin) &&
2078            IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
2079 
2080     StackOffset CalleeSavedSizeAsOffset =
2081         StackOffset::getScalable(CalleeSavedSize);
2082     DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
2083     DeallocateAfter = CalleeSavedSizeAsOffset;
2084   }
2085 
2086   // Deallocate the SVE area.
2087   if (SVEStackSize) {
2088     // If we have stack realignment or variable sized objects on the stack,
2089     // restore the stack pointer from the frame pointer prior to SVE CSR
2090     // restoration.
2091     if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
2092       if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
        // Set SP to start of SVE callee-save area from which they can
        // be reloaded. The code below will deallocate the stack space
        // by moving FP -> SP.
2096         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
2097                         StackOffset::getScalable(-CalleeSavedSize), TII,
2098                         MachineInstr::FrameDestroy);
2099       }
2100     } else {
2101       if (AFI->getSVECalleeSavedStackSize()) {
2102         // Deallocate the non-SVE locals first before we can deallocate (and
2103         // restore callee saves) from the SVE area.
2104         emitFrameOffset(
2105             MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2106             StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
2107             false, false, nullptr, EmitCFI && !hasFP(MF),
2108             SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
2109         NumBytes = 0;
2110       }
2111 
2112       emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2113                       DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
2114                       false, nullptr, EmitCFI && !hasFP(MF),
2115                       SVEStackSize +
2116                           StackOffset::getFixed(NumBytes + PrologueSaveSize));
2117 
2118       emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
2119                       DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
2120                       false, nullptr, EmitCFI && !hasFP(MF),
2121                       DeallocateAfter +
2122                           StackOffset::getFixed(NumBytes + PrologueSaveSize));
2123     }
2124     if (EmitCFI)
2125       emitCalleeSavedSVERestores(MBB, RestoreEnd);
2126   }
2127 
2128   if (!hasFP(MF)) {
2129     bool RedZone = canUseRedZone(MF);
2130     // If this was a redzone leaf function, we don't need to restore the
2131     // stack pointer (but we may need to pop stack args for fastcc).
2132     if (RedZone && AfterCSRPopSize == 0)
2133       return;
2134 
2135     // Pop the local variables off the stack. If there are no callee-saved
2136     // registers, it means we are actually positioned at the terminator and can
2137     // combine stack increment for the locals and the stack increment for
2138     // callee-popped arguments into (possibly) a single instruction and be done.
2139     bool NoCalleeSaveRestore = PrologueSaveSize == 0;
2140     int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
2141     if (NoCalleeSaveRestore)
2142       StackRestoreBytes += AfterCSRPopSize;
2143 
2144     emitFrameOffset(
2145         MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2146         StackOffset::getFixed(StackRestoreBytes), TII,
2147         MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2148         StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
2149 
2150     // If we were able to combine the local stack pop with the argument pop,
2151     // then we're done.
2152     if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
2153       if (HasWinCFI) {
2154         BuildMI(MBB, MBB.getFirstTerminator(), DL,
2155                 TII->get(AArch64::SEH_EpilogEnd))
2156             .setMIFlag(MachineInstr::FrameDestroy);
2157       }
2158       return;
2159     }
2160 
2161     NumBytes = 0;
2162   }
2163 
2164   // Restore the original stack pointer.
2165   // FIXME: Rather than doing the math here, we should instead just use
2166   // non-post-indexed loads for the restores if we aren't actually going to
2167   // be able to save any instructions.
2168   if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
2169     emitFrameOffset(
2170         MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
2171         StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
2172         TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
2173   } else if (NumBytes)
2174     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2175                     StackOffset::getFixed(NumBytes), TII,
2176                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
2177 
2178   // When we are about to restore the CSRs, the CFA register is SP again.
2179   if (EmitCFI && hasFP(MF)) {
2180     const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2181     unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2182     unsigned CFIIndex = MF.addFrameInst(
2183         MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize));
2184     BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2185         .addCFIIndex(CFIIndex)
2186         .setMIFlags(MachineInstr::FrameDestroy);
2187   }
2188 
2189   // This must be placed after the callee-save restore code because that code
2190   // assumes the SP is at the same location as it was after the callee-save save
2191   // code in the prologue.
2192   if (AfterCSRPopSize) {
2193     assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
2194                                   "interrupt may have clobbered");
2195 
2196     emitFrameOffset(
2197         MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2198         StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
2199         false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2200         StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
2201   }
2202   if (HasWinCFI)
2203     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
2204         .setMIFlag(MachineInstr::FrameDestroy);
2205 }
2206 
2207 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2208 /// debug info.  It's the same as what we use for resolving the code-gen
2209 /// references for now.  FIXME: This can go wrong when references are
2210 /// SP-relative and simple call frames aren't used.
2211 StackOffset
2212 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
2213                                              Register &FrameReg) const {
2214   return resolveFrameIndexReference(
2215       MF, FI, FrameReg,
2216       /*PreferFP=*/
2217       MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
2218       /*ForSimm=*/false);
2219 }
2220 
2221 StackOffset
2222 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
2223                                                      int FI) const {
2224   return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
2225 }
2226 
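// Compute the offset of the given object from the frame pointer, which
// points at the frame record within the callee-save area.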
2227 static StackOffset getFPOffset(const MachineFunction &MF,
2228                                int64_t ObjectOffset) {
2229   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2230   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2231   bool IsWin64 =
2232       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
2233   unsigned FixedObject =
2234       getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2235   int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
2236   int64_t FPAdjust =
2237       CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
2238   return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
2239 }
2240 
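// Compute the offset of the given object from the stack pointer once the
// fixed-size part of the frame has been allocated; callers account for any
// scalable (SVE) allocations separately.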
2241 static StackOffset getStackOffset(const MachineFunction &MF,
2242                                   int64_t ObjectOffset) {
2243   const auto &MFI = MF.getFrameInfo();
2244   return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
2245 }
2246 
// TODO: This function currently does not work for scalable vectors.
2248 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2249                                                  int FI) const {
2250   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2251       MF.getSubtarget().getRegisterInfo());
2252   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2253   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2254              ? getFPOffset(MF, ObjectOffset).getFixed()
2255              : getStackOffset(MF, ObjectOffset).getFixed();
2256 }
2257 
2258 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2259     const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2260     bool ForSimm) const {
2261   const auto &MFI = MF.getFrameInfo();
2262   int64_t ObjectOffset = MFI.getObjectOffset(FI);
2263   bool isFixed = MFI.isFixedObjectIndex(FI);
2264   bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2265   return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2266                                      PreferFP, ForSimm);
2267 }
2268 
2269 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2270     const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2271     Register &FrameReg, bool PreferFP, bool ForSimm) const {
2272   const auto &MFI = MF.getFrameInfo();
2273   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2274       MF.getSubtarget().getRegisterInfo());
2275   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2276   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2277 
2278   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2279   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2280   bool isCSR =
2281       !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2282 
2283   const StackOffset &SVEStackSize = getSVEStackSize(MF);
2284 
2285   // Use frame pointer to reference fixed objects. Use it for locals if
2286   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2287   // reliable as a base). Make sure useFPForScavengingIndex() does the
2288   // right thing for the emergency spill slot.
2289   bool UseFP = false;
2290   if (AFI->hasStackFrame() && !isSVE) {
2291     // We shouldn't prefer using the FP to access fixed-sized stack objects when
2292     // there are scalable (SVE) objects in between the FP and the fixed-sized
2293     // objects.
2294     PreferFP &= !SVEStackSize;
2295 
2296     // Note: Keeping the following as multiple 'if' statements rather than
2297     // merging to a single expression for readability.
2298     //
2299     // Argument access should always use the FP.
2300     if (isFixed) {
2301       UseFP = hasFP(MF);
2302     } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2303       // References to the CSR area must use FP if we're re-aligning the stack
2304       // since the dynamically-sized alignment padding is between the SP/BP and
2305       // the CSR area.
2306       assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2307       UseFP = true;
2308     } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2309       // If the FPOffset is negative and we're producing a signed immediate, we
2310       // have to keep in mind that the available offset range for negative
2311       // offsets is smaller than for positive ones. If an offset is available
2312       // via the FP and the SP, use whichever is closest.
2313       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2314       PreferFP |= Offset > -FPOffset && !SVEStackSize;
2315 
2316       if (MFI.hasVarSizedObjects()) {
2317         // If we have variable sized objects, we can use either FP or BP, as the
2318         // SP offset is unknown. We can use the base pointer if we have one and
2319         // FP is not preferred. If not, we're stuck with using FP.
2320         bool CanUseBP = RegInfo->hasBasePointer(MF);
2321         if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2322           UseFP = PreferFP;
2323         else if (!CanUseBP) // Can't use BP. Forced to use FP.
2324           UseFP = true;
2325         // else we can use BP and FP, but the offset from FP won't fit.
2326         // That will make us scavenge registers which we can probably avoid by
2327         // using BP. If it won't fit for BP either, we'll scavenge anyway.
2328       } else if (FPOffset >= 0) {
2329         // Use SP or FP, whichever gives us the best chance of the offset
2330         // being in range for direct access. If the FPOffset is positive,
2331         // that'll always be best, as the SP will be even further away.
2332         UseFP = true;
2333       } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2334         // Funclets access the locals contained in the parent's stack frame
2335         // via the frame pointer, so we have to use the FP in the parent
2336         // function.
2337         (void) Subtarget;
2338         assert(
2339             Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2340             "Funclets should only be present on Win64");
2341         UseFP = true;
2342       } else {
2343         // We have the choice between FP and (SP or BP).
2344         if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2345           UseFP = true;
2346       }
2347     }
2348   }
2349 
2350   assert(
2351       ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2352       "In the presence of dynamic stack pointer realignment, "
2353       "non-argument/CSR objects cannot be accessed through the frame pointer");
2354 
2355   if (isSVE) {
    StackOffset FPOffset =
        StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(),
                         ObjectOffset);
2358     StackOffset SPOffset =
2359         SVEStackSize +
2360         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2361                          ObjectOffset);
2362     // Always use the FP for SVE spills if available and beneficial.
2363     if (hasFP(MF) && (SPOffset.getFixed() ||
2364                       FPOffset.getScalable() < SPOffset.getScalable() ||
2365                       RegInfo->hasStackRealignment(MF))) {
2366       FrameReg = RegInfo->getFrameRegister(MF);
2367       return FPOffset;
2368     }
2369 
2370     FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2371                                            : (unsigned)AArch64::SP;
2372     return SPOffset;
2373   }
2374 
2375   StackOffset ScalableOffset = {};
2376   if (UseFP && !(isFixed || isCSR))
2377     ScalableOffset = -SVEStackSize;
2378   if (!UseFP && (isFixed || isCSR))
2379     ScalableOffset = SVEStackSize;
2380 
2381   if (UseFP) {
2382     FrameReg = RegInfo->getFrameRegister(MF);
2383     return StackOffset::getFixed(FPOffset) + ScalableOffset;
2384   }
2385 
2386   // Use the base pointer if we have one.
2387   if (RegInfo->hasBasePointer(MF))
2388     FrameReg = RegInfo->getBaseRegister();
2389   else {
2390     assert(!MFI.hasVarSizedObjects() &&
2391            "Can't use SP when we have var sized objects.");
2392     FrameReg = AArch64::SP;
2393     // If we're using the red zone for this function, the SP won't actually
2394     // be adjusted, so the offsets will be negative. They're also all
2395     // within range of the signed 9-bit immediate instructions.
2396     if (canUseRedZone(MF))
2397       Offset -= AFI->getLocalStackSize();
2398   }
2399 
2400   return StackOffset::getFixed(Offset) + ScalableOffset;
2401 }
2402 
2403 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2404   // Do not set a kill flag on values that are also marked as live-in. This
  // happens with the @llvm.returnaddress intrinsic and with arguments passed
  // in callee saved registers.
2407   // Omitting the kill flags is conservatively correct even if the live-in
2408   // is not used after all.
2409   bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2410   return getKillRegState(!IsLiveIn);
2411 }
2412 
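// Compact unwind is only emitted for MachO targets, and not for functions
// that may take a swifterror argument or use the swifttail calling
// convention.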
2413 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2414   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2415   AttributeList Attrs = MF.getFunction().getAttributes();
2416   return Subtarget.isTargetMachO() &&
2417          !(Subtarget.getTargetLowering()->supportSwiftError() &&
2418            Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2419          MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2420 }
2421 
2422 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2423                                              bool NeedsWinCFI, bool IsFirst) {
2424   // If we are generating register pairs for a Windows function that requires
2425   // EH support, then pair consecutive registers only.  There are no unwind
  // opcodes for saves/restores of non-consecutive register pairs.
  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2428   // save_lrpair.
2429   // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2430 
2431   if (Reg2 == AArch64::FP)
2432     return true;
2433   if (!NeedsWinCFI)
2434     return false;
2435   if (Reg2 == Reg1 + 1)
2436     return false;
2437   // If pairing a GPR with LR, the pair can be described by the save_lrpair
2438   // opcode. If this is the first register pair, it would end up with a
2439   // predecrement, but there's no save_lrpair_x opcode, so we can only do this
  // if LR is paired with a register other than the first one.
2441   // The save_lrpair opcode requires the first register to be an odd one.
2442   if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2443       (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2444     return false;
2445   return true;
2446 }
2447 
2448 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2449 /// WindowsCFI requires that only consecutive registers can be paired.
2450 /// LR and FP need to be allocated together when the frame needs to save
2451 /// the frame-record. This means any other register pairing with LR is invalid.
2452 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2453                                       bool UsesWinAAPCS, bool NeedsWinCFI,
2454                                       bool NeedsFrameRecord, bool IsFirst) {
2455   if (UsesWinAAPCS)
2456     return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2457 
2458   // If we need to store the frame record, don't pair any register
2459   // with LR other than FP.
2460   if (NeedsFrameRecord)
2461     return Reg2 == AArch64::LR;
2462 
2463   return false;
2464 }
2465 
2466 namespace {
2467 
2468 struct RegPairInfo {
2469   unsigned Reg1 = AArch64::NoRegister;
2470   unsigned Reg2 = AArch64::NoRegister;
2471   int FrameIdx;
2472   int Offset;
2473   enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2474 
2475   RegPairInfo() = default;
2476 
2477   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2478 
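  // Size in bytes of the spill slot for a single register of this type; for
  // the scalable ZPR/PPR types this is the size per 128 bits of vector
  // length.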
2479   unsigned getScale() const {
2480     switch (Type) {
2481     case PPR:
2482       return 2;
2483     case GPR:
2484     case FPR64:
2485       return 8;
2486     case ZPR:
2487     case FPR128:
2488       return 16;
2489     }
2490     llvm_unreachable("Unsupported type");
2491   }
2492 
2493   bool isScalable() const { return Type == PPR || Type == ZPR; }
2494 };
2495 
2496 } // end anonymous namespace
2497 
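// Group the callee-saved registers into pairs where the pairing rules above
// allow it, and assign each (pair of) register(s) an offset within the
// callee-save area. For WinCFI the area is filled from the bottom up,
// otherwise from the top down.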
2498 static void computeCalleeSaveRegisterPairs(
2499     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2500     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2501     bool NeedsFrameRecord) {
2502 
2503   if (CSI.empty())
2504     return;
2505 
2506   bool IsWindows = isTargetWindows(MF);
2507   bool NeedsWinCFI = needsWinCFI(MF);
2508   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2509   MachineFrameInfo &MFI = MF.getFrameInfo();
2510   CallingConv::ID CC = MF.getFunction().getCallingConv();
2511   unsigned Count = CSI.size();
2512   (void)CC;
2513   // MachO's compact unwind format relies on all registers being stored in
2514   // pairs.
2515   assert((!produceCompactUnwindFrame(MF) ||
2516           CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
2517           (Count & 1) == 0) &&
2518          "Odd number of callee-saved regs to spill!");
2519   int ByteOffset = AFI->getCalleeSavedStackSize();
2520   int StackFillDir = -1;
2521   int RegInc = 1;
2522   unsigned FirstReg = 0;
2523   if (NeedsWinCFI) {
2524     // For WinCFI, fill the stack from the bottom up.
2525     ByteOffset = 0;
2526     StackFillDir = 1;
2527     // As the CSI array is reversed to match PrologEpilogInserter, iterate
2528     // backwards, to pair up registers starting from lower numbered registers.
2529     RegInc = -1;
2530     FirstReg = Count - 1;
2531   }
2532   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2533   bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2534 
2535   // When iterating backwards, the loop condition relies on unsigned wraparound.
2536   for (unsigned i = FirstReg; i < Count; i += RegInc) {
2537     RegPairInfo RPI;
2538     RPI.Reg1 = CSI[i].getReg();
2539 
2540     if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2541       RPI.Type = RegPairInfo::GPR;
2542     else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2543       RPI.Type = RegPairInfo::FPR64;
2544     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2545       RPI.Type = RegPairInfo::FPR128;
2546     else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2547       RPI.Type = RegPairInfo::ZPR;
2548     else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2549       RPI.Type = RegPairInfo::PPR;
2550     else
2551       llvm_unreachable("Unsupported register class.");
2552 
2553     // Add the next reg to the pair if it is in the same register class.
2554     if (unsigned(i + RegInc) < Count) {
2555       Register NextReg = CSI[i + RegInc].getReg();
2556       bool IsFirst = i == FirstReg;
2557       switch (RPI.Type) {
2558       case RegPairInfo::GPR:
2559         if (AArch64::GPR64RegClass.contains(NextReg) &&
2560             !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2561                                        NeedsWinCFI, NeedsFrameRecord, IsFirst))
2562           RPI.Reg2 = NextReg;
2563         break;
2564       case RegPairInfo::FPR64:
2565         if (AArch64::FPR64RegClass.contains(NextReg) &&
2566             !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2567                                               IsFirst))
2568           RPI.Reg2 = NextReg;
2569         break;
2570       case RegPairInfo::FPR128:
2571         if (AArch64::FPR128RegClass.contains(NextReg))
2572           RPI.Reg2 = NextReg;
2573         break;
2574       case RegPairInfo::PPR:
2575       case RegPairInfo::ZPR:
2576         break;
2577       }
2578     }
2579 
2580     // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2581     // list to come in sorted by frame index so that we can issue the store
2582     // pair instructions directly. Assert if we see anything otherwise.
2583     //
2584     // The order of the registers in the list is controlled by
2585     // getCalleeSavedRegs(), so they will always be in-order, as well.
2586     assert((!RPI.isPaired() ||
2587             (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2588            "Out of order callee saved regs!");
2589 
2590     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2591             RPI.Reg1 == AArch64::LR) &&
2592            "FrameRecord must be allocated together with LR");
2593 
2594     // Windows AAPCS has FP and LR reversed.
2595     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2596             RPI.Reg2 == AArch64::LR) &&
2597            "FrameRecord must be allocated together with LR");
2598 
2599     // MachO's compact unwind format relies on all registers being stored in
2600     // adjacent register pairs.
2601     assert((!produceCompactUnwindFrame(MF) ||
2602             CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
2603             (RPI.isPaired() &&
2604              ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2605               RPI.Reg1 + 1 == RPI.Reg2))) &&
2606            "Callee-save registers not saved as adjacent register pair!");
2607 
2608     RPI.FrameIdx = CSI[i].getFrameIdx();
2609     if (NeedsWinCFI &&
2610         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2611       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2612 
2613     int Scale = RPI.getScale();
2614 
2615     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2616     assert(OffsetPre % Scale == 0);
2617 
2618     if (RPI.isScalable())
2619       ScalableByteOffset += StackFillDir * Scale;
2620     else
2621       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2622 
2623     // Swift's async context is directly before FP, so allocate an extra
2624     // 8 bytes for it.
2625     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2626         RPI.Reg2 == AArch64::FP)
2627       ByteOffset += StackFillDir * 8;
2628 
2629     assert(!(RPI.isScalable() && RPI.isPaired()) &&
2630            "Paired spill/fill instructions don't exist for SVE vectors");
2631 
2632     // Round up size of non-pair to pair size if we need to pad the
2633     // callee-save area to ensure 16-byte alignment.
2634     if (NeedGapToAlignStack && !NeedsWinCFI &&
2635         !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2636         !RPI.isPaired() && ByteOffset % 16 != 0) {
2637       ByteOffset += 8 * StackFillDir;
2638       assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2639       // A stack frame with a gap looks like this, bottom up:
      // d9, d8, x21, gap, x20, x19.
2641       // Set extra alignment on the x21 object to create the gap above it.
2642       MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2643       NeedGapToAlignStack = false;
2644     }
2645 
2646     int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2647     assert(OffsetPost % Scale == 0);
2648     // If filling top down (default), we want the offset after incrementing it.
    // If filling bottom up (WinCFI) we need the original offset.
2650     int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2651 
2652     // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2653     // Swift context can directly precede FP.
2654     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2655         RPI.Reg2 == AArch64::FP)
2656       Offset += 8;
2657     RPI.Offset = Offset / Scale;
2658 
2659     assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2660             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2661            "Offset out of bounds for LDP/STP immediate");
2662 
2663     // Save the offset to frame record so that the FP register can point to the
2664     // innermost frame record (spilled FP and LR registers).
2665     if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2666                               RPI.Reg2 == AArch64::FP) ||
2667                              (IsWindows && RPI.Reg1 == AArch64::FP &&
2668                               RPI.Reg2 == AArch64::LR)))
2669       AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2670 
2671     RegPairs.push_back(RPI);
2672     if (RPI.isPaired())
2673       i += RegInc;
2674   }
2675   if (NeedsWinCFI) {
2676     // If we need an alignment gap in the stack, align the topmost stack
2677     // object. A stack frame with a gap looks like this, bottom up:
    // x19, d8, d9, gap.
2679     // Set extra alignment on the topmost stack object (the first element in
2680     // CSI, which goes top down), to create the gap above it.
2681     if (AFI->hasCalleeSaveStackFreeSpace())
2682       MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2683     // We iterated bottom up over the registers; flip RegPairs back to top
2684     // down order.
2685     std::reverse(RegPairs.begin(), RegPairs.end());
2686   }
2687 }
2688 
2689 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2690     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2691     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2692   MachineFunction &MF = *MBB.getParent();
2693   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2694   bool NeedsWinCFI = needsWinCFI(MF);
2695   DebugLoc DL;
2696   SmallVector<RegPairInfo, 8> RegPairs;
2697 
2698   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2699 
2700   const MachineRegisterInfo &MRI = MF.getRegInfo();
2701   if (homogeneousPrologEpilog(MF)) {
2702     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2703                    .setMIFlag(MachineInstr::FrameSetup);
2704 
2705     for (auto &RPI : RegPairs) {
2706       MIB.addReg(RPI.Reg1);
2707       MIB.addReg(RPI.Reg2);
2708 
      // Update the block's live-in registers.
2710       if (!MRI.isReserved(RPI.Reg1))
2711         MBB.addLiveIn(RPI.Reg1);
2712       if (!MRI.isReserved(RPI.Reg2))
2713         MBB.addLiveIn(RPI.Reg2);
2714     }
2715     return true;
2716   }
2717   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
2718     unsigned Reg1 = RPI.Reg1;
2719     unsigned Reg2 = RPI.Reg2;
2720     unsigned StrOpc;
2721 
2722     // Issue sequence of spills for cs regs.  The first spill may be converted
2723     // to a pre-decrement store later by emitPrologue if the callee-save stack
2724     // area allocation can't be combined with the local stack area allocation.
2725     // For example:
2726     //    stp     x22, x21, [sp, #0]     // addImm(+0)
2727     //    stp     x20, x19, [sp, #16]    // addImm(+2)
2728     //    stp     fp, lr, [sp, #32]      // addImm(+4)
2729     // Rationale: This sequence saves uop updates compared to a sequence of
2730     // pre-increment spills like stp xi,xj,[sp,#-16]!
2731     // Note: Similar rationale and sequence for restores in epilog.
2732     unsigned Size;
2733     Align Alignment;
2734     switch (RPI.Type) {
2735     case RegPairInfo::GPR:
2736        StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2737        Size = 8;
2738        Alignment = Align(8);
2739        break;
2740     case RegPairInfo::FPR64:
2741        StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2742        Size = 8;
2743        Alignment = Align(8);
2744        break;
2745     case RegPairInfo::FPR128:
2746        StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2747        Size = 16;
2748        Alignment = Align(16);
2749        break;
2750     case RegPairInfo::ZPR:
2751        StrOpc = AArch64::STR_ZXI;
2752        Size = 16;
2753        Alignment = Align(16);
2754        break;
2755     case RegPairInfo::PPR:
2756        StrOpc = AArch64::STR_PXI;
2757        Size = 2;
2758        Alignment = Align(2);
2759        break;
2760     }
2761     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2762                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2763                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2764                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2765                dbgs() << ")\n");
2766 
2767     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
           "Windows unwinding requires a consecutive (FP,LR) pair");
2769     // Windows unwind codes require consecutive registers if registers are
2770     // paired.  Make the switch here, so that the code below will save (x,x+1)
2771     // and not (x+1,x).
2772     unsigned FrameIdxReg1 = RPI.FrameIdx;
2773     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2774     if (NeedsWinCFI && RPI.isPaired()) {
2775       std::swap(Reg1, Reg2);
2776       std::swap(FrameIdxReg1, FrameIdxReg2);
2777     }
2778     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2779     if (!MRI.isReserved(Reg1))
2780       MBB.addLiveIn(Reg1);
2781     if (RPI.isPaired()) {
2782       if (!MRI.isReserved(Reg2))
2783         MBB.addLiveIn(Reg2);
2784       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2785       MIB.addMemOperand(MF.getMachineMemOperand(
2786           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2787           MachineMemOperand::MOStore, Size, Alignment));
2788     }
2789     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2790         .addReg(AArch64::SP)
2791         .addImm(RPI.Offset) // [sp, #offset*scale],
2792                             // where factor*scale is implicit
2793         .setMIFlag(MachineInstr::FrameSetup);
2794     MIB.addMemOperand(MF.getMachineMemOperand(
2795         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2796         MachineMemOperand::MOStore, Size, Alignment));
2797     if (NeedsWinCFI)
2798       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2799 
2800     // Update the StackIDs of the SVE stack slots.
2801     MachineFrameInfo &MFI = MF.getFrameInfo();
2802     if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2803       MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2804 
2805   }
2806   return true;
2807 }
2808 
2809 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2810     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2811     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2812   MachineFunction &MF = *MBB.getParent();
2813   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2814   DebugLoc DL;
2815   SmallVector<RegPairInfo, 8> RegPairs;
2816   bool NeedsWinCFI = needsWinCFI(MF);
2817 
2818   if (MBBI != MBB.end())
2819     DL = MBBI->getDebugLoc();
2820 
2821   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2822 
2823   auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
2824     unsigned Reg1 = RPI.Reg1;
2825     unsigned Reg2 = RPI.Reg2;
2826 
2827     // Issue sequence of restores for cs regs. The last restore may be converted
2828     // to a post-increment load later by emitEpilogue if the callee-save stack
2829     // area allocation can't be combined with the local stack area allocation.
2830     // For example:
2831     //    ldp     fp, lr, [sp, #32]       // addImm(+4)
2832     //    ldp     x20, x19, [sp, #16]     // addImm(+2)
2833     //    ldp     x22, x21, [sp, #0]      // addImm(+0)
2834     // Note: see comment in spillCalleeSavedRegisters()
2835     unsigned LdrOpc;
2836     unsigned Size;
2837     Align Alignment;
2838     switch (RPI.Type) {
2839     case RegPairInfo::GPR:
2840        LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2841        Size = 8;
2842        Alignment = Align(8);
2843        break;
2844     case RegPairInfo::FPR64:
2845        LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2846        Size = 8;
2847        Alignment = Align(8);
2848        break;
2849     case RegPairInfo::FPR128:
2850        LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2851        Size = 16;
2852        Alignment = Align(16);
2853        break;
2854     case RegPairInfo::ZPR:
2855        LdrOpc = AArch64::LDR_ZXI;
2856        Size = 16;
2857        Alignment = Align(16);
2858        break;
2859     case RegPairInfo::PPR:
2860        LdrOpc = AArch64::LDR_PXI;
2861        Size = 2;
2862        Alignment = Align(2);
2863        break;
2864     }
2865     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2866                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2867                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2868                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2869                dbgs() << ")\n");
2870 
    // Windows unwind codes require consecutive registers if registers are
    // paired.  Make the switch here, so that the code below will restore
    // (x,x+1) and not (x+1,x).
2874     unsigned FrameIdxReg1 = RPI.FrameIdx;
2875     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2876     if (NeedsWinCFI && RPI.isPaired()) {
2877       std::swap(Reg1, Reg2);
2878       std::swap(FrameIdxReg1, FrameIdxReg2);
2879     }
2880     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
2881     if (RPI.isPaired()) {
2882       MIB.addReg(Reg2, getDefRegState(true));
2883       MIB.addMemOperand(MF.getMachineMemOperand(
2884           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2885           MachineMemOperand::MOLoad, Size, Alignment));
2886     }
2887     MIB.addReg(Reg1, getDefRegState(true))
2888         .addReg(AArch64::SP)
2889         .addImm(RPI.Offset) // [sp, #offset*scale]
2890                             // where factor*scale is implicit
2891         .setMIFlag(MachineInstr::FrameDestroy);
2892     MIB.addMemOperand(MF.getMachineMemOperand(
2893         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2894         MachineMemOperand::MOLoad, Size, Alignment));
2895     if (NeedsWinCFI)
2896       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2897 
2898     return MIB->getIterator();
2899   };
2900 
2901   // SVE objects are always restored in reverse order.
2902   for (const RegPairInfo &RPI : reverse(RegPairs))
2903     if (RPI.isScalable())
2904       EmitMI(RPI);
2905 
2906   if (homogeneousPrologEpilog(MF, &MBB)) {
2907     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
2908                    .setMIFlag(MachineInstr::FrameDestroy);
2909     for (auto &RPI : RegPairs) {
2910       MIB.addReg(RPI.Reg1, RegState::Define);
2911       MIB.addReg(RPI.Reg2, RegState::Define);
2912     }
2913     return true;
2914   }
2915 
2916   if (ReverseCSRRestoreSeq) {
2917     MachineBasicBlock::iterator First = MBB.end();
2918     for (const RegPairInfo &RPI : reverse(RegPairs)) {
2919       if (RPI.isScalable())
2920         continue;
2921       MachineBasicBlock::iterator It = EmitMI(RPI);
2922       if (First == MBB.end())
2923         First = It;
2924     }
2925     if (First != MBB.end())
2926       MBB.splice(MBBI, &MBB, First);
2927   } else {
2928     for (const RegPairInfo &RPI : RegPairs) {
2929       if (RPI.isScalable())
2930         continue;
2931       (void)EmitMI(RPI);
2932     }
2933   }
2934 
2935   return true;
2936 }
2937 
2938 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2939                                                 BitVector &SavedRegs,
2940                                                 RegScavenger *RS) const {
2941   // All calls are tail calls in GHC calling conv, and functions have no
2942   // prologue/epilogue.
2943   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2944     return;
2945 
2946   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2947   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2948       MF.getSubtarget().getRegisterInfo());
2949   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2950   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2951   unsigned UnspilledCSGPR = AArch64::NoRegister;
2952   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2953 
2954   MachineFrameInfo &MFI = MF.getFrameInfo();
2955   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2956 
2957   unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2958                                 ? RegInfo->getBaseRegister()
2959                                 : (unsigned)AArch64::NoRegister;
2960 
2961   unsigned ExtraCSSpill = 0;
2962   // Figure out which callee-saved registers to save/restore.
2963   for (unsigned i = 0; CSRegs[i]; ++i) {
2964     const unsigned Reg = CSRegs[i];
2965 
2966     // Add the base pointer register to SavedRegs if it is callee-save.
2967     if (Reg == BasePointerReg)
2968       SavedRegs.set(Reg);
2969 
2970     bool RegUsed = SavedRegs.test(Reg);
2971     unsigned PairedReg = AArch64::NoRegister;
2972     if (AArch64::GPR64RegClass.contains(Reg) ||
2973         AArch64::FPR64RegClass.contains(Reg) ||
2974         AArch64::FPR128RegClass.contains(Reg))
2975       PairedReg = CSRegs[i ^ 1];
2976 
2977     if (!RegUsed) {
2978       if (AArch64::GPR64RegClass.contains(Reg) &&
2979           !RegInfo->isReservedReg(MF, Reg)) {
2980         UnspilledCSGPR = Reg;
2981         UnspilledCSGPRPaired = PairedReg;
2982       }
2983       continue;
2984     }
2985 
2986     // MachO's compact unwind format relies on all registers being stored in
2987     // pairs.
2988     // FIXME: the usual format is actually better if unwinding isn't needed.
2989     if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
2990         !SavedRegs.test(PairedReg)) {
2991       SavedRegs.set(PairedReg);
2992       if (AArch64::GPR64RegClass.contains(PairedReg) &&
2993           !RegInfo->isReservedReg(MF, PairedReg))
2994         ExtraCSSpill = PairedReg;
2995     }
2996   }
2997 
2998   if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2999       !Subtarget.isTargetWindows()) {
    // For the Windows calling convention on a non-Windows OS, where X18 is
    // treated as reserved, back up X18 when entering non-Windows code (marked
    // with the Windows calling convention) and restore it when returning,
    // regardless of whether the individual function uses it - it might call
    // other functions that clobber it.
3005     SavedRegs.set(AArch64::X18);
3006   }
3007 
3008   // Calculates the callee saved stack size.
3009   unsigned CSStackSize = 0;
3010   unsigned SVECSStackSize = 0;
3011   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3012   const MachineRegisterInfo &MRI = MF.getRegInfo();
3013   for (unsigned Reg : SavedRegs.set_bits()) {
3014     auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
3015     if (AArch64::PPRRegClass.contains(Reg) ||
3016         AArch64::ZPRRegClass.contains(Reg))
3017       SVECSStackSize += RegSize;
3018     else
3019       CSStackSize += RegSize;
3020   }
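  // E.g. a function saving x19, x20, fp, lr and z8 gets CSStackSize == 32
  // (four 8-byte GPRs) and SVECSStackSize == 16 (illustrative example).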
3021 
3022   // Save number of saved regs, so we can easily update CSStackSize later.
3023   unsigned NumSavedRegs = SavedRegs.count();
3024 
  // The frame record needs to be created by saving the appropriate registers.
3026   uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
3027   if (hasFP(MF) ||
3028       windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
3029     SavedRegs.set(AArch64::FP);
3030     SavedRegs.set(AArch64::LR);
3031   }
3032 
3033   LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
3034              for (unsigned Reg
3035                   : SavedRegs.set_bits()) dbgs()
3036              << ' ' << printReg(Reg, RegInfo);
3037              dbgs() << "\n";);
3038 
3039   // If any callee-saved registers are used, the frame cannot be eliminated.
3040   int64_t SVEStackSize =
3041       alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
3042   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
3043 
3044   // The CSR spill slots have not been allocated yet, so estimateStackSize
3045   // won't include them.
3046   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
3047 
3048   // Conservatively always assume BigStack when there are SVE spills.
3049   bool BigStack = SVEStackSize ||
3050                   (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
3051   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
3052     AFI->setHasStackFrame(true);
3053 
3054   // Estimate if we might need to scavenge a register at some point in order
3055   // to materialize a stack offset. If so, either spill one additional
3056   // callee-saved register or reserve a special spill slot to facilitate
3057   // register scavenging. If we already spilled an extra callee-saved register
3058   // above to keep the number of spills even, we don't need to do anything else
3059   // here.
3060   if (BigStack) {
3061     if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
3062       LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
3063                         << " to get a scratch register.\n");
3064       SavedRegs.set(UnspilledCSGPR);
3065       // MachO's compact unwind format relies on all registers being stored in
3066       // pairs, so if we need to spill one extra for BigStack, then we need to
3067       // store the pair.
3068       if (producePairRegisters(MF))
3069         SavedRegs.set(UnspilledCSGPRPaired);
3070       ExtraCSSpill = UnspilledCSGPR;
3071     }
3072 
3073     // If we didn't find an extra callee-saved register to spill, create
3074     // an emergency spill slot.
3075     if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
3076       const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3077       const TargetRegisterClass &RC = AArch64::GPR64RegClass;
3078       unsigned Size = TRI->getSpillSize(RC);
3079       Align Alignment = TRI->getSpillAlign(RC);
3080       int FI = MFI.CreateStackObject(Size, Alignment, false);
3081       RS->addScavengingFrameIndex(FI);
3082       LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
3083                         << " as the emergency spill slot.\n");
3084     }
3085   }
3086 
  // Add the size of any additional 64-bit GPR saves.
3088   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
3089 
3090   // A Swift asynchronous context extends the frame record with a pointer
3091   // directly before FP.
3092   if (hasFP(MF) && AFI->hasSwiftAsyncContext())
3093     CSStackSize += 8;
3094 
3095   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
3096   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
3097                << EstimatedStackSize + AlignedCSStackSize
3098                << " bytes.\n");
3099 
3100   assert((!MFI.isCalleeSavedInfoValid() ||
3101           AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
3102          "Should not invalidate callee saved info");
3103 
3104   // Round up to register pair alignment to avoid additional SP adjustment
3105   // instructions.
3106   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
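  // If rounding created slack (e.g. an odd number of 8-byte GPR saves: x19,
  // x20, x21, fp and lr give CSStackSize == 40, rounded up to 48), record it
  // so the spare 8 bytes can be reused as an emergency spill slot (see
  // enableStackSlotScavenging()).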
3107   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
3108   AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
3109 }
3110 
3111 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
3112     MachineFunction &MF, const TargetRegisterInfo *RegInfo,
3113     std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
3114     unsigned &MaxCSFrameIndex) const {
3115   bool NeedsWinCFI = needsWinCFI(MF);
3116   // To match the canonical windows frame layout, reverse the list of
3117   // callee saved registers to get them laid out by PrologEpilogInserter
3118   // in the right order. (PrologEpilogInserter allocates stack objects top
3119   // down. Windows canonical prologs store higher numbered registers at
3120   // the top, thus have the CSI array start from the highest registers.)
3121   if (NeedsWinCFI)
3122     std::reverse(CSI.begin(), CSI.end());
3123 
3124   if (CSI.empty())
3125     return true; // Early exit if no callee saved registers are modified!
3126 
3127   // Now that we know which registers need to be saved and restored, allocate
3128   // stack slots for them.
3129   MachineFrameInfo &MFI = MF.getFrameInfo();
3130   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3131 
3132   bool UsesWinAAPCS = isTargetWindows(MF);
3133   if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
3134     int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
3135     AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3136     if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3137     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3138   }
3139 
3140   for (auto &CS : CSI) {
3141     Register Reg = CS.getReg();
3142     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
3143 
3144     unsigned Size = RegInfo->getSpillSize(*RC);
3145     Align Alignment(RegInfo->getSpillAlign(*RC));
3146     int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
3147     CS.setFrameIdx(FrameIdx);
3148 
3149     if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3150     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3151 
3152     // Grab 8 bytes below FP for the extended asynchronous frame info.
3153     if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
3154         Reg == AArch64::FP) {
3155       FrameIdx = MFI.CreateStackObject(8, Alignment, true);
3156       AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3157       if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3158       if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3159     }
3160   }
3161   return true;
3162 }
3163 
3164 bool AArch64FrameLowering::enableStackSlotScavenging(
3165     const MachineFunction &MF) const {
3166   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3167   return AFI->hasCalleeSaveStackFreeSpace();
3168 }
3169 
/// Returns true if there are any SVE callee saves.
3171 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
3172                                       int &Min, int &Max) {
3173   Min = std::numeric_limits<int>::max();
3174   Max = std::numeric_limits<int>::min();
3175 
3176   if (!MFI.isCalleeSavedInfoValid())
3177     return false;
3178 
3179   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
3180   for (auto &CS : CSI) {
3181     if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
3182         AArch64::PPRRegClass.contains(CS.getReg())) {
3183       assert((Max == std::numeric_limits<int>::min() ||
3184               Max + 1 == CS.getFrameIdx()) &&
3185              "SVE CalleeSaves are not consecutive");
3186 
3187       Min = std::min(Min, CS.getFrameIdx());
3188       Max = std::max(Max, CS.getFrameIdx());
3189     }
3190   }
3191   return Min != std::numeric_limits<int>::max();
3192 }
3193 
3194 // Process all the SVE stack objects and determine offsets for each
3195 // object. If AssignOffsets is true, the offsets get assigned.
3196 // Fills in the first and last callee-saved frame indices into
3197 // Min/MaxCSFrameIndex, respectively.
3198 // Returns the size of the stack.
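// Illustrative example (offsets are in "scalable" bytes, scaled by vscale at
// runtime): two spilled Z registers followed by one 16-byte SVE local are
// assigned offsets -16, -32 and -48, and 48 is returned as the size.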
3199 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
3200                                               int &MinCSFrameIndex,
3201                                               int &MaxCSFrameIndex,
3202                                               bool AssignOffsets) {
3203 #ifndef NDEBUG
3204   // First process all fixed stack objects.
3205   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
3206     assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
3207            "SVE vectors should never be passed on the stack by value, only by "
3208            "reference.");
3209 #endif
3210 
3211   auto Assign = [&MFI](int FI, int64_t Offset) {
3212     LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
3213     MFI.setObjectOffset(FI, Offset);
3214   };
3215 
3216   int64_t Offset = 0;
3217 
3218   // Then process all callee saved slots.
3219   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
3220     // Assign offsets to the callee save slots.
3221     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3222       Offset += MFI.getObjectSize(I);
3223       Offset = alignTo(Offset, MFI.getObjectAlign(I));
3224       if (AssignOffsets)
3225         Assign(I, -Offset);
3226     }
3227   }
3228 
  // Ensure that the callee-save area is aligned to 16 bytes.
3230   Offset = alignTo(Offset, Align(16U));
3231 
3232   // Create a buffer of SVE objects to allocate and sort it.
3233   SmallVector<int, 8> ObjectsToAllocate;
3234   // If we have a stack protector, and we've previously decided that we have SVE
3235   // objects on the stack and thus need it to go in the SVE stack area, then it
3236   // needs to go first.
3237   int StackProtectorFI = -1;
3238   if (MFI.hasStackProtectorIndex()) {
3239     StackProtectorFI = MFI.getStackProtectorIndex();
3240     if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
3241       ObjectsToAllocate.push_back(StackProtectorFI);
3242   }
3243   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3244     unsigned StackID = MFI.getStackID(I);
3245     if (StackID != TargetStackID::ScalableVector)
3246       continue;
3247     if (I == StackProtectorFI)
3248       continue;
3249     if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3250       continue;
3251     if (MFI.isDeadObjectIndex(I))
3252       continue;
3253 
3254     ObjectsToAllocate.push_back(I);
3255   }
3256 
3257   // Allocate all SVE locals and spills
3258   for (unsigned FI : ObjectsToAllocate) {
3259     Align Alignment = MFI.getObjectAlign(FI);
3260     // FIXME: Given that the length of SVE vectors is not necessarily a power of
3261     // two, we'd need to align every object dynamically at runtime if the
3262     // alignment is larger than 16. This is not yet supported.
3263     if (Alignment > Align(16))
3264       report_fatal_error(
3265           "Alignment of scalable vectors > 16 bytes is not yet supported");
3266 
3267     Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3268     if (AssignOffsets)
3269       Assign(FI, -Offset);
3270   }
3271 
3272   return Offset;
3273 }
3274 
3275 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3276     MachineFrameInfo &MFI) const {
3277   int MinCSFrameIndex, MaxCSFrameIndex;
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        false);
3279 }
3280 
3281 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3282     MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3283   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3284                                         true);
3285 }
3286 
3287 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3288     MachineFunction &MF, RegScavenger *RS) const {
3289   MachineFrameInfo &MFI = MF.getFrameInfo();
3290 
3291   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3292          "Upwards growing stack unsupported");
3293 
3294   int MinCSFrameIndex, MaxCSFrameIndex;
3295   int64_t SVEStackSize =
3296       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3297 
3298   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3299   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3300   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3301 
3302   // If this function isn't doing Win64-style C++ EH, we don't need to do
3303   // anything.
3304   if (!MF.hasEHFunclets())
3305     return;
3306   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3307   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3308 
3309   MachineBasicBlock &MBB = MF.front();
3310   auto MBBI = MBB.begin();
3311   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3312     ++MBBI;
3313 
3314   // Create an UnwindHelp object.
  // The UnwindHelp object is allocated at the start of the fixed object area.
3316   int64_t FixedObject =
3317       getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3318   int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3319                                            /*SPOffset*/ -FixedObject,
3320                                            /*IsImmutable=*/false);
3321   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3322 
3323   // We need to store -2 into the UnwindHelp object at the start of the
3324   // function.
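  // E.g., with x8 as the scavenged scratch register (illustrative), the
  // emitted code is roughly:
  //   mov  x8, #-2
  //   stur x8, [<UnwindHelp slot>]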
3325   DebugLoc DL;
3326   RS->enterBasicBlockEnd(MBB);
3327   RS->backward(std::prev(MBBI));
3328   Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3329   assert(DstReg && "There must be a free register after frame setup");
3330   BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3331   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3332       .addReg(DstReg, getKillRegState(true))
3333       .addFrameIndex(UnwindHelpFI)
3334       .addImm(0);
3335 }
3336 
3337 namespace {
3338 struct TagStoreInstr {
3339   MachineInstr *MI;
3340   int64_t Offset, Size;
3341   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3342       : MI(MI), Offset(Offset), Size(Size) {}
3343 };
3344 
3345 class TagStoreEdit {
3346   MachineFunction *MF;
3347   MachineBasicBlock *MBB;
3348   MachineRegisterInfo *MRI;
3349   // Tag store instructions that are being replaced.
3350   SmallVector<TagStoreInstr, 8> TagStores;
3351   // Combined memref arguments of the above instructions.
3352   SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3353 
3354   // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3355   // FrameRegOffset + Size) with the address tag of SP.
3356   Register FrameReg;
3357   StackOffset FrameRegOffset;
3358   int64_t Size;
3359   // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
3360   Optional<int64_t> FrameRegUpdate;
3361   // MIFlags for any FrameReg updating instructions.
3362   unsigned FrameRegUpdateFlags;
3363 
3364   // Use zeroing instruction variants.
3365   bool ZeroData;
3366   DebugLoc DL;
3367 
3368   void emitUnrolled(MachineBasicBlock::iterator InsertI);
3369   void emitLoop(MachineBasicBlock::iterator InsertI);
3370 
3371 public:
3372   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3373       : MBB(MBB), ZeroData(ZeroData) {
3374     MF = MBB->getParent();
3375     MRI = &MF->getRegInfo();
3376   }
  // Add an instruction to be replaced. Instructions must be added in
  // ascending order of Offset and must be adjacent.
3379   void addInstruction(TagStoreInstr I) {
3380     assert((TagStores.empty() ||
3381             TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3382            "Non-adjacent tag store instructions.");
3383     TagStores.push_back(I);
3384   }
3385   void clear() { TagStores.clear(); }
3386   // Emit equivalent code at the given location, and erase the current set of
3387   // instructions. May skip if the replacement is not profitable. May invalidate
3388   // the input iterator and replace it with a valid one.
3389   void emitCode(MachineBasicBlock::iterator &InsertI,
3390                 const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
3391 };
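// Typical usage (see tryMergeAdjacentSTG below): create one TagStoreEdit per
// contiguous run of tag stores, addInstruction() each adjacent TagStoreInstr
// in offset order, then emitCode() once to rewrite the whole run.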
3392 
3393 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3394   const AArch64InstrInfo *TII =
3395       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3396 
3397   const int64_t kMinOffset = -256 * 16;
3398   const int64_t kMaxOffset = 255 * 16;
3399 
3400   Register BaseReg = FrameReg;
3401   int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3402   if (BaseRegOffsetBytes < kMinOffset ||
3403       BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
3404     Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3405     emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3406                     StackOffset::getFixed(BaseRegOffsetBytes), TII);
3407     BaseReg = ScratchReg;
3408     BaseRegOffsetBytes = 0;
3409   }
3410 
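  // Sketch of the expansion: e.g. Size == 48 produces one ST2G (32 bytes)
  // followed by one STG (16 bytes), both tagging with the address tag of SP;
  // immediates are encoded in 16-byte granules (BaseRegOffsetBytes / 16).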
3411   MachineInstr *LastI = nullptr;
3412   while (Size) {
3413     int64_t InstrSize = (Size > 16) ? 32 : 16;
3414     unsigned Opcode =
3415         InstrSize == 16
3416             ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
3417             : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
3418     MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3419                           .addReg(AArch64::SP)
3420                           .addReg(BaseReg)
3421                           .addImm(BaseRegOffsetBytes / 16)
3422                           .setMemRefs(CombinedMemRefs);
3423     // A store to [BaseReg, #0] should go last for an opportunity to fold the
3424     // final SP adjustment in the epilogue.
3425     if (BaseRegOffsetBytes == 0)
3426       LastI = I;
3427     BaseRegOffsetBytes += InstrSize;
3428     Size -= InstrSize;
3429   }
3430 
3431   if (LastI)
3432     MBB->splice(InsertI, MBB, LastI);
3433 }
3434 
3435 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3436   const AArch64InstrInfo *TII =
3437       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3438 
3439   Register BaseReg = FrameRegUpdate
3440                          ? FrameReg
3441                          : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3442   Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3443 
3444   emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3445 
3446   int64_t LoopSize = Size;
3447   // If the loop size is not a multiple of 32, split off one 16-byte store at
3448   // the end to fold BaseReg update into.
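  // (E.g. Size == 272 with a pending FrameReg update becomes an STGloop_wback
  // over 256 bytes plus one trailing post-index STG for the final 16 bytes.)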
3449   if (FrameRegUpdate && *FrameRegUpdate)
3450     LoopSize -= LoopSize % 32;
3451   MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3452                                 TII->get(ZeroData ? AArch64::STZGloop_wback
3453                                                   : AArch64::STGloop_wback))
3454                             .addDef(SizeReg)
3455                             .addDef(BaseReg)
3456                             .addImm(LoopSize)
3457                             .addReg(BaseReg)
3458                             .setMemRefs(CombinedMemRefs);
3459   if (FrameRegUpdate)
3460     LoopI->setFlags(FrameRegUpdateFlags);
3461 
3462   int64_t ExtraBaseRegUpdate =
3463       FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3464   if (LoopSize < Size) {
3465     assert(FrameRegUpdate);
3466     assert(Size - LoopSize == 16);
3467     // Tag 16 more bytes at BaseReg and update BaseReg.
3468     BuildMI(*MBB, InsertI, DL,
3469             TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3470         .addDef(BaseReg)
3471         .addReg(BaseReg)
3472         .addReg(BaseReg)
3473         .addImm(1 + ExtraBaseRegUpdate / 16)
3474         .setMemRefs(CombinedMemRefs)
3475         .setMIFlags(FrameRegUpdateFlags);
3476   } else if (ExtraBaseRegUpdate) {
3477     // Update BaseReg.
3478     BuildMI(
3479         *MBB, InsertI, DL,
3480         TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3481         .addDef(BaseReg)
3482         .addReg(BaseReg)
3483         .addImm(std::abs(ExtraBaseRegUpdate))
3484         .addImm(0)
3485         .setMIFlags(FrameRegUpdateFlags);
3486   }
3487 }
3488 
// Check if *II is a register update that can be merged into the STGloop that
// ends at (Reg + Size). The offset applied by *II is written to *TotalOffset.
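// For example, an "add sp, sp, #48" directly after a loop that tags 48 bytes
// ending at sp+48 leaves no residual adjustment and can be merged; any
// residual must be 16-byte aligned and fit an unshifted ADDXri/SUBXri
// immediate.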
3492 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3493                        int64_t Size, int64_t *TotalOffset) {
3494   MachineInstr &MI = *II;
3495   if ((MI.getOpcode() == AArch64::ADDXri ||
3496        MI.getOpcode() == AArch64::SUBXri) &&
3497       MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3498     unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3499     int64_t Offset = MI.getOperand(2).getImm() << Shift;
3500     if (MI.getOpcode() == AArch64::SUBXri)
3501       Offset = -Offset;
3502     int64_t AbsPostOffset = std::abs(Offset - Size);
3503     const int64_t kMaxOffset =
3504         0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3505     if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3506       *TotalOffset = Offset;
3507       return true;
3508     }
3509   }
3510   return false;
3511 }
3512 
3513 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3514                   SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3515   MemRefs.clear();
3516   for (auto &TS : TSE) {
3517     MachineInstr *MI = TS.MI;
3518     // An instruction without memory operands may access anything. Be
3519     // conservative and return an empty list.
3520     if (MI->memoperands_empty()) {
3521       MemRefs.clear();
3522       return;
3523     }
3524     MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3525   }
3526 }
3527 
3528 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3529                             const AArch64FrameLowering *TFI,
3530                             bool TryMergeSPUpdate) {
3531   if (TagStores.empty())
3532     return;
3533   TagStoreInstr &FirstTagStore = TagStores[0];
3534   TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3535   Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3536   DL = TagStores[0].MI->getDebugLoc();
3537 
3538   Register Reg;
3539   FrameRegOffset = TFI->resolveFrameOffsetReference(
3540       *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3541       /*PreferFP=*/false, /*ForSimm=*/true);
3542   FrameReg = Reg;
3543   FrameRegUpdate = None;
3544 
3545   mergeMemRefs(TagStores, CombinedMemRefs);
3546 
3547   LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3548              for (const auto &Instr
3549                   : TagStores) { dbgs() << "  " << *Instr.MI; });
3550 
3551   // Size threshold where a loop becomes shorter than a linear sequence of
3552   // tagging instructions.
3553   const int kSetTagLoopThreshold = 176;
3554   if (Size < kSetTagLoopThreshold) {
3555     if (TagStores.size() < 2)
3556       return;
3557     emitUnrolled(InsertI);
3558   } else {
3559     MachineInstr *UpdateInstr = nullptr;
3560     int64_t TotalOffset = 0;
3561     if (TryMergeSPUpdate) {
3562       // See if we can merge base register update into the STGloop.
      // This is done in AArch64LoadStoreOptimizer for "normal" stores, but
      // STGloop is too unusual for that pass and realistically only occurs in
      // the function epilogue. Also, STGloop is expanded before that pass
      // runs.
3567       if (InsertI != MBB->end() &&
3568           canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3569                             &TotalOffset)) {
3570         UpdateInstr = &*InsertI++;
3571         LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
3572                           << *UpdateInstr);
3573       }
3574     }
3575 
3576     if (!UpdateInstr && TagStores.size() < 2)
3577       return;
3578 
3579     if (UpdateInstr) {
3580       FrameRegUpdate = TotalOffset;
3581       FrameRegUpdateFlags = UpdateInstr->getFlags();
3582     }
3583     emitLoop(InsertI);
3584     if (UpdateInstr)
3585       UpdateInstr->eraseFromParent();
3586   }
3587 
3588   for (auto &TS : TagStores)
3589     TS.MI->eraseFromParent();
3590 }
3591 
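// Returns true if MI is a tag store that can take part in merging, and fills
// in its frame Offset, Size in bytes and whether it zeroes the data. For
// example (illustrative), an STGOffset of frame index FI with immediate 2
// yields Size == 16 and Offset == getObjectOffset(FI) + 32; STGloop/STZGloop
// qualify only when their def operands are dead.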
3592 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3593                                         int64_t &Size, bool &ZeroData) {
3594   MachineFunction &MF = *MI.getParent()->getParent();
3595   const MachineFrameInfo &MFI = MF.getFrameInfo();
3596 
3597   unsigned Opcode = MI.getOpcode();
3598   ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3599               Opcode == AArch64::STZ2GOffset);
3600 
3601   if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3602     if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3603       return false;
3604     if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3605       return false;
3606     Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3607     Size = MI.getOperand(2).getImm();
3608     return true;
3609   }
3610 
3611   if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3612     Size = 16;
3613   else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3614     Size = 32;
3615   else
3616     return false;
3617 
3618   if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3619     return false;
3620 
3621   Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3622            16 * MI.getOperand(2).getImm();
3623   return true;
3624 }
3625 
3626 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3627 // and replace them with a shorter instruction sequence:
3628 // * replace STG + STG with ST2G
3629 // * replace STGloop + STGloop with STGloop
3630 // This code needs to run when stack slot offsets are already known, but before
3631 // FrameIndex operands in STG instructions are eliminated.
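// For example, two adjacent STGOffset instructions covering bytes [0, 16) and
// [16, 32) of neighbouring slots are rewritten into a single ST2GOffset.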
3632 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3633                                                 const AArch64FrameLowering *TFI,
3634                                                 RegScavenger *RS) {
3635   bool FirstZeroData;
3636   int64_t Size, Offset;
3637   MachineInstr &MI = *II;
3638   MachineBasicBlock *MBB = MI.getParent();
3639   MachineBasicBlock::iterator NextI = ++II;
3640   if (&MI == &MBB->instr_back())
3641     return II;
3642   if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3643     return II;
3644 
3645   SmallVector<TagStoreInstr, 4> Instrs;
3646   Instrs.emplace_back(&MI, Offset, Size);
3647 
3648   constexpr int kScanLimit = 10;
3649   int Count = 0;
3650   for (MachineBasicBlock::iterator E = MBB->end();
3651        NextI != E && Count < kScanLimit; ++NextI) {
3652     MachineInstr &MI = *NextI;
3653     bool ZeroData;
3654     int64_t Size, Offset;
3655     // Collect instructions that update memory tags with a FrameIndex operand
3656     // and (when applicable) constant size, and whose output registers are dead
3657     // (the latter is almost always the case in practice). Since these
3658     // instructions effectively have no inputs or outputs, we are free to skip
3659     // any non-aliasing instructions in between without tracking used registers.
3660     if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3661       if (ZeroData != FirstZeroData)
3662         break;
3663       Instrs.emplace_back(&MI, Offset, Size);
3664       continue;
3665     }
3666 
3667     // Only count non-transient, non-tagging instructions toward the scan
3668     // limit.
3669     if (!MI.isTransient())
3670       ++Count;
3671 
3672     // Just in case, stop before the epilogue code starts.
3673     if (MI.getFlag(MachineInstr::FrameSetup) ||
3674         MI.getFlag(MachineInstr::FrameDestroy))
3675       break;
3676 
3677     // Reject anything that may alias the collected instructions.
3678     if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3679       break;
3680   }
3681 
3682   // New code will be inserted after the last tagging instruction we've found.
3683   MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3684   InsertI++;
3685 
3686   llvm::stable_sort(Instrs,
3687                     [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3688                       return Left.Offset < Right.Offset;
3689                     });
3690 
3691   // Make sure that we don't have any overlapping stores.
3692   int64_t CurOffset = Instrs[0].Offset;
3693   for (auto &Instr : Instrs) {
3694     if (CurOffset > Instr.Offset)
3695       return NextI;
3696     CurOffset = Instr.Offset + Instr.Size;
3697   }
3698 
  // Find contiguous runs of tagged memory and emit shorter instruction
  // sequences for them when possible.
3701   TagStoreEdit TSE(MBB, FirstZeroData);
3702   Optional<int64_t> EndOffset;
3703   for (auto &Instr : Instrs) {
3704     if (EndOffset && *EndOffset != Instr.Offset) {
3705       // Found a gap.
3706       TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
3707       TSE.clear();
3708     }
3709 
3710     TSE.addInstruction(Instr);
3711     EndOffset = Instr.Offset + Instr.Size;
3712   }
3713 
3714   // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
3715   TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
3716                !MBB->getParent()
3717                     ->getInfo<AArch64FunctionInfo>()
3718                     ->needsAsyncDwarfUnwindInfo());
3719 
3720   return InsertI;
3721 }
3722 } // namespace
3723 
3724 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3725     MachineFunction &MF, RegScavenger *RS = nullptr) const {
3726   if (StackTaggingMergeSetTag)
3727     for (auto &BB : MF)
3728       for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3729         II = tryMergeAdjacentSTG(II, this, RS);
3730 }
3731 
3732 /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3733 /// before the update.  This is easily retrieved as it is exactly the offset
3734 /// that is set in processFunctionBeforeFrameFinalized.
3735 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3736     const MachineFunction &MF, int FI, Register &FrameReg,
3737     bool IgnoreSPUpdates) const {
3738   const MachineFrameInfo &MFI = MF.getFrameInfo();
3739   if (IgnoreSPUpdates) {
3740     LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3741                       << MFI.getObjectOffset(FI) << "\n");
3742     FrameReg = AArch64::SP;
3743     return StackOffset::getFixed(MFI.getObjectOffset(FI));
3744   }
3745 
3746   // Go to common code if we cannot provide sp + offset.
3747   if (MFI.hasVarSizedObjects() ||
3748       MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
3749       MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
3750     return getFrameIndexReference(MF, FI, FrameReg);
3751 
3752   FrameReg = AArch64::SP;
3753   return getStackOffset(MF, MFI.getObjectOffset(FI));
3754 }
3755 
3756 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3757 /// the parent's frame pointer
3758 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3759     const MachineFunction &MF) const {
3760   return 0;
3761 }
3762 
3763 /// Funclets only need to account for space for the callee saved registers,
3764 /// as the locals are accounted for in the parent's stack frame.
3765 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3766     const MachineFunction &MF) const {
3767   // This is the size of the pushed CSRs.
3768   unsigned CSSize =
3769       MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3770   // This is the amount of stack a funclet needs to allocate.
3771   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3772                  getStackAlign());
3773 }
3774 
3775 namespace {
3776 struct FrameObject {
3777   bool IsValid = false;
3778   // Index of the object in MFI.
3779   int ObjectIndex = 0;
3780   // Group ID this object belongs to.
3781   int GroupIndex = -1;
3782   // This object should be placed first (closest to SP).
3783   bool ObjectFirst = false;
3784   // This object's group (which always contains the object with
3785   // ObjectFirst==true) should be placed first.
3786   bool GroupFirst = false;
3787 };
3788 
3789 class GroupBuilder {
3790   SmallVector<int, 8> CurrentMembers;
3791   int NextGroupIndex = 0;
3792   std::vector<FrameObject> &Objects;
3793 
3794 public:
3795   GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3796   void AddMember(int Index) { CurrentMembers.push_back(Index); }
3797   void EndCurrentGroup() {
3798     if (CurrentMembers.size() > 1) {
3799       // Create a new group with the current member list. This might remove them
3800       // from their pre-existing groups. That's OK, dealing with overlapping
3801       // groups is too hard and unlikely to make a difference.
3802       LLVM_DEBUG(dbgs() << "group:");
3803       for (int Index : CurrentMembers) {
3804         Objects[Index].GroupIndex = NextGroupIndex;
3805         LLVM_DEBUG(dbgs() << " " << Index);
3806       }
3807       LLVM_DEBUG(dbgs() << "\n");
3808       NextGroupIndex++;
3809     }
3810     CurrentMembers.clear();
3811   }
3812 };
3813 
3814 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3815   // Objects at a lower index are closer to FP; objects at a higher index are
3816   // closer to SP.
3817   //
3818   // For consistency in our comparison, all invalid objects are placed
3819   // at the end. This also allows us to stop walking when we hit the
3820   // first invalid item after it's all sorted.
3821   //
3822   // The "first" object goes first (closest to SP), followed by the members of
3823   // the "first" group.
3824   //
3825   // The rest are sorted by the group index to keep the groups together.
3826   // Higher numbered groups are more likely to be around longer (i.e. untagged
3827   // in the function epilogue and not at some earlier point). Place them closer
3828   // to SP.
3829   //
3830   // If all else equal, sort by the object index to keep the objects in the
3831   // original order.
3832   return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3833                          A.ObjectIndex) <
3834          std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3835                          B.ObjectIndex);
3836 }
3837 } // namespace
3838 
3839 void AArch64FrameLowering::orderFrameObjects(
3840     const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3841   if (!OrderFrameObjects || ObjectsToAllocate.empty())
3842     return;
3843 
3844   const MachineFrameInfo &MFI = MF.getFrameInfo();
3845   std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3846   for (auto &Obj : ObjectsToAllocate) {
3847     FrameObjects[Obj].IsValid = true;
3848     FrameObjects[Obj].ObjectIndex = Obj;
3849   }
3850 
3851   // Identify stack slots that are tagged at the same time.
3852   GroupBuilder GB(FrameObjects);
3853   for (auto &MBB : MF) {
3854     for (auto &MI : MBB) {
3855       if (MI.isDebugInstr())
3856         continue;
3857       int OpIndex;
3858       switch (MI.getOpcode()) {
3859       case AArch64::STGloop:
3860       case AArch64::STZGloop:
3861         OpIndex = 3;
3862         break;
3863       case AArch64::STGOffset:
3864       case AArch64::STZGOffset:
3865       case AArch64::ST2GOffset:
3866       case AArch64::STZ2GOffset:
3867         OpIndex = 1;
3868         break;
3869       default:
3870         OpIndex = -1;
3871       }
3872 
3873       int TaggedFI = -1;
3874       if (OpIndex >= 0) {
3875         const MachineOperand &MO = MI.getOperand(OpIndex);
3876         if (MO.isFI()) {
3877           int FI = MO.getIndex();
3878           if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3879               FrameObjects[FI].IsValid)
3880             TaggedFI = FI;
3881         }
3882       }
3883 
3884       // If this is a stack tagging instruction for a slot that is not part of a
3885       // group yet, either start a new group or add it to the current one.
3886       if (TaggedFI >= 0)
3887         GB.AddMember(TaggedFI);
3888       else
3889         GB.EndCurrentGroup();
3890     }
3891     // Groups should never span multiple basic blocks.
3892     GB.EndCurrentGroup();
3893   }
3894 
3895   // If the function's tagged base pointer is pinned to a stack slot, we want to
3896   // put that slot first when possible. This will likely place it at SP + 0,
3897   // and save one instruction when generating the base pointer because IRG does
3898   // not allow an immediate offset.
3899   const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3900   Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3901   if (TBPI) {
3902     FrameObjects[*TBPI].ObjectFirst = true;
3903     FrameObjects[*TBPI].GroupFirst = true;
3904     int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3905     if (FirstGroupIndex >= 0)
3906       for (FrameObject &Object : FrameObjects)
3907         if (Object.GroupIndex == FirstGroupIndex)
3908           Object.GroupFirst = true;
3909   }
3910 
3911   llvm::stable_sort(FrameObjects, FrameObjectCompare);
3912 
3913   int i = 0;
3914   for (auto &Obj : FrameObjects) {
3915     // All invalid items are sorted at the end, so it's safe to stop.
3916     if (!Obj.IsValid)
3917       break;
3918     ObjectsToAllocate[i++] = Obj.ObjectIndex;
3919   }
3920 
3921   LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3922                                                     : FrameObjects) {
3923     if (!Obj.IsValid)
3924       break;
3925     dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3926     if (Obj.ObjectFirst)
3927       dbgs() << ", first";
3928     if (Obj.GroupFirst)
3929       dbgs() << ", group-first";
3930     dbgs() << "\n";
3931   });
3932 }
3933