//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the
// main function body, after the prologue is run. However, it's depicted here
// for completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// |        SVE stack objects          |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// |                                   |
// | local variables of fixed size     |
// | including spill slots             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access data in a frame, a constant offset from one of the pointers (fp,
// bp, sp) must be computable at compile time. The sizes of the areas with a
// dotted background cannot be computed at compile time if they are present,
// so all three of fp, bp and sp need to be set up in order to reach every
// frame area, assuming all of the frame areas are non-empty.
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
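//
// As an illustrative example: in a function that has both a VLA and a
// 32-byte-aligned local, incoming stack arguments are reached via fp,
// fixed-size locals via bp (sp keeps moving as the VLA grows), and the VLA
// plus outgoing arguments via sp, so all three pointers end up being needed.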
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the framepointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//     ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the framepointer before
// accessing the SVE object in the frame.
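//
// As an illustrative example, an SVE callee-save can then be reloaded with a
// single VL-scaled access such as:
//     ldr z9, [fp, #-6 mul vl]
// whereas a fixed-size gap between fp and the SVE area would first require
// materialising that byte offset into a scratch register (e.g. a 'sub' from
// fp) before any VL-scaled addressing could be applied.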
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
// Outgoing function arguments must be at the bottom of the stack frame when
// calling another function. If we do not have variable-sized stack objects, we
// can allocate a "reserved call frame" area at the bottom of the local
// variable area, large enough for all outgoing calls. If we do have VLAs, then
// the stack pointer must be decremented and incremented around each call to
// make space for the arguments below the VLAs.
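//
// As an illustrative example, with a reserved call frame the outgoing
// arguments are simply stored at [sp], [sp, #8], ... ahead of the bl;
// without one, each call site is bracketed with something like:
//     sub sp, sp, #32
//     ... store outgoing arguments, bl callee ...
//     add sp, sp, #32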
//
// FIXME: also explain the redzone concept.
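// Briefly: a leaf function whose frame is small enough may, on targets that
// permit it, use up to 128 bytes directly below sp (the "red zone") without
// adjusting sp at all; see canUseRedZone() below.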
//
// An example of the prologue:
//
//     .globl __foo
//     .align 2
//  __foo:
// Ltmp0:
//     .cfi_startproc
//     .cfi_personality 155, ___gxx_personality_v0
// Leh_func_begin:
//     .cfi_lsda 16, Lexception33
//
//     stp  xa,bx, [sp, -#offset]!
//     ...
//     stp  x28, x27, [sp, #offset-32]
//     stp  fp, lr, [sp, #offset-16]
//     add  fp, sp, #offset - 16
//     sub  sp, sp, #1360
//
// The Stack:
//       +-------------------------------------------+
// 10000 | ........ | ........ | ........ | ........ |
// 10004 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10008 | ........ | ........ | ........ | ........ |
// 1000c | ........ | ........ | ........ | ........ |
//       +===========================================+
// 10010 |                X28 Register               |
// 10014 |                X28 Register               |
//       +-------------------------------------------+
// 10018 |                X27 Register               |
// 1001c |                X27 Register               |
//       +===========================================+
// 10020 |                Frame Pointer              |
// 10024 |                Frame Pointer              |
//       +-------------------------------------------+
// 10028 |                Link Register              |
// 1002c |                Link Register              |
//       +===========================================+
// 10030 | ........ | ........ | ........ | ........ |
// 10034 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10038 | ........ | ........ | ........ | ........ |
// 1003c | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
//
// [sp] = 10030        ::    >>initial value<<
// sp = 10020          ::  stp fp, lr, [sp, #-16]!
// fp = sp == 10020    ::  mov fp, sp
// [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
// sp == 10010         ::  >>final value<<
//
// The frame pointer (w29) points to address 10020. If we use an offset of
// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
// for w27, and -32 for w28:
//
// Ltmp1:
//     .cfi_def_cfa w29, 16
// Ltmp2:
//     .cfi_offset w30, -8
// Ltmp3:
//     .cfi_offset w29, -16
// Ltmp4:
//     .cfi_offset w27, -24
// Ltmp5:
//     .cfi_offset w28, -32
//
//===----------------------------------------------------------------------===//
187
188 #include "AArch64FrameLowering.h"
189 #include "AArch64InstrInfo.h"
190 #include "AArch64MachineFunctionInfo.h"
191 #include "AArch64RegisterInfo.h"
192 #include "AArch64Subtarget.h"
193 #include "AArch64TargetMachine.h"
194 #include "MCTargetDesc/AArch64AddressingModes.h"
195 #include "MCTargetDesc/AArch64MCTargetDesc.h"
196 #include "llvm/ADT/ScopeExit.h"
197 #include "llvm/ADT/SmallVector.h"
198 #include "llvm/ADT/Statistic.h"
199 #include "llvm/CodeGen/LivePhysRegs.h"
200 #include "llvm/CodeGen/MachineBasicBlock.h"
201 #include "llvm/CodeGen/MachineFrameInfo.h"
202 #include "llvm/CodeGen/MachineFunction.h"
203 #include "llvm/CodeGen/MachineInstr.h"
204 #include "llvm/CodeGen/MachineInstrBuilder.h"
205 #include "llvm/CodeGen/MachineMemOperand.h"
206 #include "llvm/CodeGen/MachineModuleInfo.h"
207 #include "llvm/CodeGen/MachineOperand.h"
208 #include "llvm/CodeGen/MachineRegisterInfo.h"
209 #include "llvm/CodeGen/RegisterScavenging.h"
210 #include "llvm/CodeGen/TargetInstrInfo.h"
211 #include "llvm/CodeGen/TargetRegisterInfo.h"
212 #include "llvm/CodeGen/TargetSubtargetInfo.h"
213 #include "llvm/CodeGen/WinEHFuncInfo.h"
214 #include "llvm/IR/Attributes.h"
215 #include "llvm/IR/CallingConv.h"
216 #include "llvm/IR/DataLayout.h"
217 #include "llvm/IR/DebugLoc.h"
218 #include "llvm/IR/Function.h"
219 #include "llvm/MC/MCAsmInfo.h"
220 #include "llvm/MC/MCDwarf.h"
221 #include "llvm/Support/CommandLine.h"
222 #include "llvm/Support/Debug.h"
223 #include "llvm/Support/ErrorHandling.h"
224 #include "llvm/Support/MathExtras.h"
225 #include "llvm/Support/raw_ostream.h"
226 #include "llvm/Target/TargetMachine.h"
227 #include "llvm/Target/TargetOptions.h"
228 #include <cassert>
229 #include <cstdint>
230 #include <iterator>
231 #include <vector>
232
233 using namespace llvm;
234
235 #define DEBUG_TYPE "frame-info"
236
237 static cl::opt<bool> EnableRedZone("aarch64-redzone",
238 cl::desc("enable use of redzone on AArch64"),
239 cl::init(false), cl::Hidden);
240
241 static cl::opt<bool>
242 ReverseCSRRestoreSeq("reverse-csr-restore-seq",
243 cl::desc("reverse the CSR restore sequence"),
244 cl::init(false), cl::Hidden);
245
246 static cl::opt<bool> StackTaggingMergeSetTag(
247 "stack-tagging-merge-settag",
248 cl::desc("merge settag instruction in function epilog"), cl::init(true),
249 cl::Hidden);
250
251 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
252 cl::desc("sort stack allocations"),
253 cl::init(true), cl::Hidden);
254
255 cl::opt<bool> EnableHomogeneousPrologEpilog(
256 "homogeneous-prolog-epilog", cl::Hidden,
257 cl::desc("Emit homogeneous prologue and epilogue for the size "
258 "optimization (default = off)"));
259
260 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
261
262 /// Returns how much of the incoming argument stack area (in bytes) we should
263 /// clean up in an epilogue. For the C calling convention this will be 0, for
264 /// guaranteed tail call conventions it can be positive (a normal return or a
265 /// tail call to a function that uses less stack space for arguments) or
266 /// negative (for a tail call to a function that needs more stack space than us
267 /// for arguments).
static int64_t getArgumentStackToRestore(MachineFunction &MF,
                                         MachineBasicBlock &MBB) {
270 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
271 bool IsTailCallReturn = false;
272 if (MBB.end() != MBBI) {
273 unsigned RetOpcode = MBBI->getOpcode();
274 IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
275 RetOpcode == AArch64::TCRETURNri ||
276 RetOpcode == AArch64::TCRETURNriBTI;
277 }
278 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
279
280 int64_t ArgumentPopSize = 0;
281 if (IsTailCallReturn) {
282 MachineOperand &StackAdjust = MBBI->getOperand(1);
283
284 // For a tail-call in a callee-pops-arguments environment, some or all of
285 // the stack may actually be in use for the call's arguments, this is
286 // calculated during LowerCall and consumed here...
287 ArgumentPopSize = StackAdjust.getImm();
288 } else {
289 // ... otherwise the amount to pop is *all* of the argument space,
290 // conveniently stored in the MachineFunctionInfo by
291 // LowerFormalArguments. This will, of course, be zero for the C calling
292 // convention.
293 ArgumentPopSize = AFI->getArgumentStackToRestore();
294 }
295
296 return ArgumentPopSize;
297 }
298
299 static bool produceCompactUnwindFrame(MachineFunction &MF);
300 static bool needsWinCFI(const MachineFunction &MF);
301 static StackOffset getSVEStackSize(const MachineFunction &MF);
302 static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF);
303
304 /// Returns true if a homogeneous prolog or epilog code can be emitted
305 /// for the size optimization. If possible, a frame helper call is injected.
306 /// When Exit block is given, this check is for epilog.
bool AArch64FrameLowering::homogeneousPrologEpilog(
    MachineFunction &MF, MachineBasicBlock *Exit) const {
309 if (!MF.getFunction().hasMinSize())
310 return false;
311 if (!EnableHomogeneousPrologEpilog)
312 return false;
313 if (ReverseCSRRestoreSeq)
314 return false;
315 if (EnableRedZone)
316 return false;
317
// TODO: Windows is not supported yet.
319 if (needsWinCFI(MF))
320 return false;
321 // TODO: SVE is not supported yet.
322 if (getSVEStackSize(MF))
323 return false;
324
325 // Bail on stack adjustment needed on return for simplicity.
326 const MachineFrameInfo &MFI = MF.getFrameInfo();
327 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
328 if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
329 return false;
330 if (Exit && getArgumentStackToRestore(MF, *Exit))
331 return false;
332
333 return true;
334 }
335
336 /// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
338 return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
339 }
340
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(),
/// isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
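// (This corresponds to the 9-bit signed immediate range [-256, 255] of the
// unscaled LDUR/STUR forms; scaled addressing modes can reach further.)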
346
347 /// Look at each instruction that references stack frames and return the stack
348 /// size limit beyond which some of these instructions will require a scratch
349 /// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
351 // FIXME: For now, just conservatively guestimate based on unscaled indexing
352 // range. We'll end up allocating an unnecessary spill slot a lot, but
353 // realistically that's not a big deal at this stage of the game.
354 for (MachineBasicBlock &MBB : MF) {
355 for (MachineInstr &MI : MBB) {
356 if (MI.isDebugInstr() || MI.isPseudo() ||
357 MI.getOpcode() == AArch64::ADDXri ||
358 MI.getOpcode() == AArch64::ADDSXri)
359 continue;
360
361 for (const MachineOperand &MO : MI.operands()) {
362 if (!MO.isFI())
363 continue;
364
365 StackOffset Offset;
366 if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
367 AArch64FrameOffsetCannotUpdate)
368 return 0;
369 }
370 }
371 }
372 return DefaultSafeSPDisplacement;
373 }
374
375 TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
377 return TargetStackID::ScalableVector;
378 }
379
380 /// Returns the size of the fixed object area (allocated next to sp on entry)
381 /// On Win64 this may include a var args area and an UnwindHelp object for EH.
static unsigned getFixedObjectSize(const MachineFunction &MF,
                                   const AArch64FunctionInfo *AFI, bool IsWin64,
                                   bool IsFunclet) {
385 if (!IsWin64 || IsFunclet) {
386 return AFI->getTailCallReservedStack();
387 } else {
388 if (AFI->getTailCallReservedStack() != 0)
389 report_fatal_error("cannot generate ABI-changing tail call for Win64");
390 // Var args are stored here in the primary function.
391 const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
392 // To support EH funclets we allocate an UnwindHelp object
393 const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
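// For example, a Win64 vararg function with 24 bytes of register varargs
// that also uses EH funclets gets alignTo(24 + 8, 16) == 32 bytes here.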
394 return alignTo(VarArgsArea + UnwindHelpObject, 16);
395 }
396 }
397
398 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
400 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
401 return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
402 }
403
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
405 if (!EnableRedZone)
406 return false;
407
408 // Don't use the red zone if the function explicitly asks us not to.
409 // This is typically used for kernel code.
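// (At the IR level this is normally expressed with the `noredzone` function
// attribute.)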
410 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
411 const unsigned RedZoneSize =
412 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
413 if (!RedZoneSize)
414 return false;
415
416 const MachineFrameInfo &MFI = MF.getFrameInfo();
417 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
418 uint64_t NumBytes = AFI->getLocalStackSize();
419
420 return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
421 getSVEStackSize(MF));
422 }
423
424 /// hasFP - Return true if the specified function should have a dedicated frame
425 /// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
427 const MachineFrameInfo &MFI = MF.getFrameInfo();
428 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
429 // Win64 EH requires a frame pointer if funclets are present, as the locals
430 // are accessed off the frame pointer in both the parent function and the
431 // funclets.
432 if (MF.hasEHFunclets())
433 return true;
434 // Retain behavior of always omitting the FP for leaf functions when possible.
435 if (MF.getTarget().Options.DisableFramePointerElim(MF))
436 return true;
437 if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
438 MFI.hasStackMap() || MFI.hasPatchPoint() ||
439 RegInfo->hasStackRealignment(MF))
440 return true;
441 // With large callframes around we may need to use FP to access the scavenging
442 // emergency spillslot.
443 //
444 // Unfortunately some calls to hasFP() like machine verifier ->
445 // getReservedReg() -> hasFP in the middle of global isel are too early
446 // to know the max call frame size. Hopefully conservatively returning "true"
447 // in those cases is fine.
448 // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
449 if (!MFI.isMaxCallFrameSizeComputed() ||
450 MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
451 return true;
452
453 return false;
454 }
455
456 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
457 /// not required, we reserve argument space for call sites in the function
458 /// immediately on entry to the current function. This eliminates the need for
459 /// add/sub sp brackets around call sites. Returns true if the call frame is
460 /// included as part of the stack frame.
461 bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
463 return !MF.getFrameInfo().hasVarSizedObjects();
464 }
465
MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
469 const AArch64InstrInfo *TII =
470 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
471 DebugLoc DL = I->getDebugLoc();
472 unsigned Opc = I->getOpcode();
473 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
474 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
475
476 if (!hasReservedCallFrame(MF)) {
477 int64_t Amount = I->getOperand(0).getImm();
478 Amount = alignTo(Amount, getStackAlign());
479 if (!IsDestroy)
480 Amount = -Amount;
481
482 // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
483 // doesn't have to pop anything), then the first operand will be zero too so
484 // this adjustment is a no-op.
485 if (CalleePopAmount == 0) {
486 // FIXME: in-function stack adjustment for calls is limited to 24-bits
487 // because there's no guaranteed temporary register available.
488 //
489 // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
490 // 1) For offset <= 12-bit, we use LSL #0
491 // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
492 // LSL #0, and the other uses LSL #12.
493 //
494 // Most call frames will be allocated at the start of a function so
495 // this is OK, but it is a limitation that needs dealing with.
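//
// For example, an adjustment of 0x12345 bytes could be emitted as:
//   sub sp, sp, #0x12, lsl #12
//   sub sp, sp, #0x345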
496 assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
497 emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
498 StackOffset::getFixed(Amount), TII);
499 }
500 } else if (CalleePopAmount != 0) {
501 // If the calling convention demands that the callee pops arguments from the
502 // stack, we want to add it back if we have a reserved call frame.
503 assert(CalleePopAmount < 0xffffff && "call frame too large");
504 emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
505 StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
506 }
507 return MBB.erase(I);
508 }
509
void AArch64FrameLowering::emitCalleeSavedGPRLocations(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
512 MachineFunction &MF = *MBB.getParent();
513 MachineFrameInfo &MFI = MF.getFrameInfo();
514
515 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
516 if (CSI.empty())
517 return;
518
519 const TargetSubtargetInfo &STI = MF.getSubtarget();
520 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
521 const TargetInstrInfo &TII = *STI.getInstrInfo();
522 DebugLoc DL = MBB.findDebugLoc(MBBI);
523
524 for (const auto &Info : CSI) {
525 if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
526 continue;
527
528 assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
529 unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
530
531 int64_t Offset =
532 MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
533 unsigned CFIIndex = MF.addFrameInst(
534 MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
535 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
536 .addCFIIndex(CFIIndex)
537 .setMIFlags(MachineInstr::FrameSetup);
538 }
539 }
540
void AArch64FrameLowering::emitCalleeSavedSVELocations(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
543 MachineFunction &MF = *MBB.getParent();
544 MachineFrameInfo &MFI = MF.getFrameInfo();
545
546 // Add callee saved registers to move list.
547 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
548 if (CSI.empty())
549 return;
550
551 const TargetSubtargetInfo &STI = MF.getSubtarget();
552 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
553 const TargetInstrInfo &TII = *STI.getInstrInfo();
554 DebugLoc DL = MBB.findDebugLoc(MBBI);
555 AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
556
557 for (const auto &Info : CSI) {
558 if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
559 continue;
560
// Not all unwinders may know about SVE registers, so assume the lowest
// common denominator.
563 assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
564 unsigned Reg = Info.getReg();
565 if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
566 continue;
567
568 StackOffset Offset =
569 StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
570 StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
571
572 unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset));
573 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
574 .addCFIIndex(CFIIndex)
575 .setMIFlags(MachineInstr::FrameSetup);
576 }
577 }
578
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
581 emitCalleeSavedGPRLocations(MBB, MBBI);
582 emitCalleeSavedSVELocations(MBB, MBBI);
583 }
584
static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator InsertPt,
                               unsigned DwarfReg) {
589 unsigned CFIIndex =
590 MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg));
591 BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex);
592 }
593
void AArch64FrameLowering::resetCFIToInitialState(
    MachineBasicBlock &MBB) const {
596
597 MachineFunction &MF = *MBB.getParent();
598 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
599 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
600 const auto &TRI =
601 static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
602 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
603
604 const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION);
605 DebugLoc DL;
606
607 // Reset the CFA to `SP + 0`.
608 MachineBasicBlock::iterator InsertPt = MBB.begin();
609 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
610 nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0));
611 BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
612
613 // Flip the RA sign state.
614 if (MFI.shouldSignReturnAddress()) {
615 CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
616 BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
617 }
618
619 // Shadow call stack uses X18, reset it.
620 if (needsShadowCallStackPrologueEpilogue(MF))
621 insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
622 TRI.getDwarfRegNum(AArch64::X18, true));
623
624 // Emit .cfi_same_value for callee-saved registers.
625 const std::vector<CalleeSavedInfo> &CSI =
626 MF.getFrameInfo().getCalleeSavedInfo();
627 for (const auto &Info : CSI) {
628 unsigned Reg = Info.getReg();
629 if (!TRI.regNeedsCFI(Reg, Reg))
630 continue;
631 insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
632 TRI.getDwarfRegNum(Reg, true));
633 }
634 }
635
static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    bool SVE) {
639 MachineFunction &MF = *MBB.getParent();
640 MachineFrameInfo &MFI = MF.getFrameInfo();
641
642 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
643 if (CSI.empty())
644 return;
645
646 const TargetSubtargetInfo &STI = MF.getSubtarget();
647 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
648 const TargetInstrInfo &TII = *STI.getInstrInfo();
649 DebugLoc DL = MBB.findDebugLoc(MBBI);
650
651 for (const auto &Info : CSI) {
652 if (SVE !=
653 (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
654 continue;
655
656 unsigned Reg = Info.getReg();
657 if (SVE &&
658 !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
659 continue;
660
661 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
662 nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
663 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
664 .addCFIIndex(CFIIndex)
665 .setMIFlags(MachineInstr::FrameDestroy);
666 }
667 }
668
void AArch64FrameLowering::emitCalleeSavedGPRRestores(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
671 emitCalleeSavedRestores(MBB, MBBI, false);
672 }
673
void AArch64FrameLowering::emitCalleeSavedSVERestores(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
676 emitCalleeSavedRestores(MBB, MBBI, true);
677 }
678
static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
680 switch (Reg.id()) {
681 default:
682 // The called routine is expected to preserve r19-r28
683 // r29 and r30 are used as frame pointer and link register resp.
684 return 0;
685
686 // GPRs
687 #define CASE(n) \
688 case AArch64::W##n: \
689 case AArch64::X##n: \
690 return AArch64::X##n
691 CASE(0);
692 CASE(1);
693 CASE(2);
694 CASE(3);
695 CASE(4);
696 CASE(5);
697 CASE(6);
698 CASE(7);
699 CASE(8);
700 CASE(9);
701 CASE(10);
702 CASE(11);
703 CASE(12);
704 CASE(13);
705 CASE(14);
706 CASE(15);
707 CASE(16);
708 CASE(17);
709 CASE(18);
710 #undef CASE
711
712 // FPRs
713 #define CASE(n) \
714 case AArch64::B##n: \
715 case AArch64::H##n: \
716 case AArch64::S##n: \
717 case AArch64::D##n: \
718 case AArch64::Q##n: \
719 return HasSVE ? AArch64::Z##n : AArch64::Q##n
720 CASE(0);
721 CASE(1);
722 CASE(2);
723 CASE(3);
724 CASE(4);
725 CASE(5);
726 CASE(6);
727 CASE(7);
728 CASE(8);
729 CASE(9);
730 CASE(10);
731 CASE(11);
732 CASE(12);
733 CASE(13);
734 CASE(14);
735 CASE(15);
736 CASE(16);
737 CASE(17);
738 CASE(18);
739 CASE(19);
740 CASE(20);
741 CASE(21);
742 CASE(22);
743 CASE(23);
744 CASE(24);
745 CASE(25);
746 CASE(26);
747 CASE(27);
748 CASE(28);
749 CASE(29);
750 CASE(30);
751 CASE(31);
752 #undef CASE
753 }
754 }
755
void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
                                                MachineBasicBlock &MBB) const {
758 // Insertion point.
759 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
760
761 // Fake a debug loc.
762 DebugLoc DL;
763 if (MBBI != MBB.end())
764 DL = MBBI->getDebugLoc();
765
766 const MachineFunction &MF = *MBB.getParent();
767 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
768 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
769
770 BitVector GPRsToZero(TRI.getNumRegs());
771 BitVector FPRsToZero(TRI.getNumRegs());
772 bool HasSVE = STI.hasSVE();
773 for (MCRegister Reg : RegsToZero.set_bits()) {
774 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
775 // For GPRs, we only care to clear out the 64-bit register.
776 if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
777 GPRsToZero.set(XReg);
778 } else if (AArch64::FPR128RegClass.contains(Reg) ||
779 AArch64::FPR64RegClass.contains(Reg) ||
780 AArch64::FPR32RegClass.contains(Reg) ||
781 AArch64::FPR16RegClass.contains(Reg) ||
782 AArch64::FPR8RegClass.contains(Reg)) {
// For FPRs, zero the widest register (Q, or Z when SVE is available).
784 if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
785 FPRsToZero.set(XReg);
786 }
787 }
788
789 const AArch64InstrInfo &TII = *STI.getInstrInfo();
790
791 // Zero out GPRs.
792 for (MCRegister Reg : GPRsToZero.set_bits())
793 BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), Reg).addImm(0);
794
795 // Zero out FP/vector registers.
796 for (MCRegister Reg : FPRsToZero.set_bits())
797 if (HasSVE)
798 BuildMI(MBB, MBBI, DL, TII.get(AArch64::DUP_ZI_D), Reg)
799 .addImm(0)
800 .addImm(0);
801 else
802 BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVIv2d_ns), Reg).addImm(0);
803
804 if (HasSVE) {
805 for (MCRegister PReg :
806 {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
807 AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
808 AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
809 AArch64::P15}) {
810 if (RegsToZero[PReg])
811 BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
812 }
813 }
814 }
815
816 // Find a scratch register that we can use at the start of the prologue to
817 // re-align the stack pointer. We avoid using callee-save registers since they
818 // may appear to be free when this is called from canUseAsPrologue (during
819 // shrink wrapping), but then no longer be free when this is called from
820 // emitPrologue.
821 //
822 // FIXME: This is a bit conservative, since in the above case we could use one
823 // of the callee-save registers as a scratch temp to re-align the stack pointer,
824 // but we would then have to make sure that we were in fact saving at least one
825 // callee-save register in the prologue, which is additional complexity that
826 // doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
828 MachineFunction *MF = MBB->getParent();
829
830 // If MBB is an entry block, use X9 as the scratch register
831 if (&MF->front() == MBB)
832 return AArch64::X9;
833
834 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
835 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
836 LivePhysRegs LiveRegs(TRI);
837 LiveRegs.addLiveIns(*MBB);
838
839 // Mark callee saved registers as used so we will not choose them.
840 const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
841 for (unsigned i = 0; CSRegs[i]; ++i)
842 LiveRegs.addReg(CSRegs[i]);
843
844 // Prefer X9 since it was historically used for the prologue scratch reg.
845 const MachineRegisterInfo &MRI = MF->getRegInfo();
846 if (LiveRegs.available(MRI, AArch64::X9))
847 return AArch64::X9;
848
849 for (unsigned Reg : AArch64::GPR64RegClass) {
850 if (LiveRegs.available(MRI, Reg))
851 return Reg;
852 }
853 return AArch64::NoRegister;
854 }
855
bool AArch64FrameLowering::canUseAsPrologue(
    const MachineBasicBlock &MBB) const {
858 const MachineFunction *MF = MBB.getParent();
859 MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
860 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
861 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
862
863 // Don't need a scratch register if we're not going to re-align the stack.
864 if (!RegInfo->hasStackRealignment(*MF))
865 return true;
866 // Otherwise, we can use any block as long as it has a scratch register
867 // available.
868 return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
869 }
870
static bool windowsRequiresStackProbe(MachineFunction &MF,
                                      uint64_t StackSizeInBytes) {
873 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
874 if (!Subtarget.isTargetWindows())
875 return false;
876 const Function &F = MF.getFunction();
877 // TODO: When implementing stack protectors, take that into account
878 // for the probe threshold.
879 unsigned StackProbeSize = 4096;
880 if (F.hasFnAttribute("stack-probe-size"))
881 F.getFnAttribute("stack-probe-size")
882 .getValueAsString()
883 .getAsInteger(0, StackProbeSize);
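// The attribute is normally set by the front end (e.g. from a flag such as
// -mstack-probe-size); the hard-coded 4096 above is only the default.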
884 return (StackSizeInBytes >= StackProbeSize) &&
885 !F.hasFnAttribute("no-stack-arg-probe");
886 }
887
static bool needsWinCFI(const MachineFunction &MF) {
889 const Function &F = MF.getFunction();
890 return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
891 F.needsUnwindTableEntry();
892 }
893
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
    MachineFunction &MF, uint64_t StackBumpBytes) const {
896 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
897 const MachineFrameInfo &MFI = MF.getFrameInfo();
898 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
899 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
900 if (homogeneousPrologEpilog(MF))
901 return false;
902
903 if (AFI->getLocalStackSize() == 0)
904 return false;
905
906 // For WinCFI, if optimizing for size, prefer to not combine the stack bump
907 // (to force a stp with predecrement) to match the packed unwind format,
908 // provided that there actually are any callee saved registers to merge the
909 // decrement with.
910 // This is potentially marginally slower, but allows using the packed
911 // unwind format for functions that both have a local area and callee saved
912 // registers. Using the packed unwind format notably reduces the size of
913 // the unwind info.
914 if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
915 MF.getFunction().hasOptSize())
916 return false;
917
918 // 512 is the maximum immediate for stp/ldp that will be used for
919 // callee-save save/restores
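// (An X-register stp/ldp takes a signed 7-bit immediate scaled by 8, i.e.
// byte offsets in [-512, 504], so 512 is used as a conservative cut-off.)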
920 if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
921 return false;
922
923 if (MFI.hasVarSizedObjects())
924 return false;
925
926 if (RegInfo->hasStackRealignment(MF))
927 return false;
928
929 // This isn't strictly necessary, but it simplifies things a bit since the
930 // current RedZone handling code assumes the SP is adjusted by the
931 // callee-save save/restore code.
932 if (canUseRedZone(MF))
933 return false;
934
935 // When there is an SVE area on the stack, always allocate the
936 // callee-saves and spills/locals separately.
937 if (getSVEStackSize(MF))
938 return false;
939
940 return true;
941 }
942
bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
945 if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
946 return false;
947
948 if (MBB.empty())
949 return true;
950
951 // Disable combined SP bump if the last instruction is an MTE tag store. It
952 // is almost always better to merge SP adjustment into those instructions.
953 MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
954 MachineBasicBlock::iterator Begin = MBB.begin();
955 while (LastI != Begin) {
956 --LastI;
957 if (LastI->isTransient())
958 continue;
959 if (!LastI->getFlag(MachineInstr::FrameDestroy))
960 break;
961 }
962 switch (LastI->getOpcode()) {
963 case AArch64::STGloop:
964 case AArch64::STZGloop:
965 case AArch64::STGOffset:
966 case AArch64::STZGOffset:
967 case AArch64::ST2GOffset:
968 case AArch64::STZ2GOffset:
969 return false;
970 default:
971 return true;
972 }
973 llvm_unreachable("unreachable");
974 }
975
976 // Given a load or a store instruction, generate an appropriate unwinding SEH
977 // code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
                                             const TargetInstrInfo &TII,
                                             MachineInstr::MIFlag Flag) {
981 unsigned Opc = MBBI->getOpcode();
982 MachineBasicBlock *MBB = MBBI->getParent();
983 MachineFunction &MF = *MBB->getParent();
984 DebugLoc DL = MBBI->getDebugLoc();
985 unsigned ImmIdx = MBBI->getNumOperands() - 1;
986 int Imm = MBBI->getOperand(ImmIdx).getImm();
987 MachineInstrBuilder MIB;
988 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
989 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
990
991 switch (Opc) {
992 default:
993 llvm_unreachable("No SEH Opcode for this instruction");
994 case AArch64::LDPDpost:
995 Imm = -Imm;
996 LLVM_FALLTHROUGH;
997 case AArch64::STPDpre: {
998 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
999 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
1000 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
1001 .addImm(Reg0)
1002 .addImm(Reg1)
1003 .addImm(Imm * 8)
1004 .setMIFlag(Flag);
1005 break;
1006 }
1007 case AArch64::LDPXpost:
1008 Imm = -Imm;
1009 LLVM_FALLTHROUGH;
1010 case AArch64::STPXpre: {
1011 Register Reg0 = MBBI->getOperand(1).getReg();
1012 Register Reg1 = MBBI->getOperand(2).getReg();
1013 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1014 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
1015 .addImm(Imm * 8)
1016 .setMIFlag(Flag);
1017 else
1018 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
1019 .addImm(RegInfo->getSEHRegNum(Reg0))
1020 .addImm(RegInfo->getSEHRegNum(Reg1))
1021 .addImm(Imm * 8)
1022 .setMIFlag(Flag);
1023 break;
1024 }
1025 case AArch64::LDRDpost:
1026 Imm = -Imm;
1027 LLVM_FALLTHROUGH;
1028 case AArch64::STRDpre: {
1029 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1030 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
1031 .addImm(Reg)
1032 .addImm(Imm)
1033 .setMIFlag(Flag);
1034 break;
1035 }
1036 case AArch64::LDRXpost:
1037 Imm = -Imm;
1038 LLVM_FALLTHROUGH;
1039 case AArch64::STRXpre: {
1040 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1041 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
1042 .addImm(Reg)
1043 .addImm(Imm)
1044 .setMIFlag(Flag);
1045 break;
1046 }
1047 case AArch64::STPDi:
1048 case AArch64::LDPDi: {
1049 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1050 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1051 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
1052 .addImm(Reg0)
1053 .addImm(Reg1)
1054 .addImm(Imm * 8)
1055 .setMIFlag(Flag);
1056 break;
1057 }
1058 case AArch64::STPXi:
1059 case AArch64::LDPXi: {
1060 Register Reg0 = MBBI->getOperand(0).getReg();
1061 Register Reg1 = MBBI->getOperand(1).getReg();
1062 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1063 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
1064 .addImm(Imm * 8)
1065 .setMIFlag(Flag);
1066 else
1067 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
1068 .addImm(RegInfo->getSEHRegNum(Reg0))
1069 .addImm(RegInfo->getSEHRegNum(Reg1))
1070 .addImm(Imm * 8)
1071 .setMIFlag(Flag);
1072 break;
1073 }
1074 case AArch64::STRXui:
1075 case AArch64::LDRXui: {
1076 int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1077 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
1078 .addImm(Reg)
1079 .addImm(Imm * 8)
1080 .setMIFlag(Flag);
1081 break;
1082 }
1083 case AArch64::STRDui:
1084 case AArch64::LDRDui: {
1085 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1086 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
1087 .addImm(Reg)
1088 .addImm(Imm * 8)
1089 .setMIFlag(Flag);
1090 break;
1091 }
1092 }
1093 auto I = MBB->insertAfter(MBBI, MIB);
1094 return I;
1095 }
1096
1097 // Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
                           unsigned LocalStackSize) {
1100 MachineOperand *ImmOpnd = nullptr;
1101 unsigned ImmIdx = MBBI->getNumOperands() - 1;
1102 switch (MBBI->getOpcode()) {
1103 default:
1104 llvm_unreachable("Fix the offset in the SEH instruction");
1105 case AArch64::SEH_SaveFPLR:
1106 case AArch64::SEH_SaveRegP:
1107 case AArch64::SEH_SaveReg:
1108 case AArch64::SEH_SaveFRegP:
1109 case AArch64::SEH_SaveFReg:
1110 ImmOpnd = &MBBI->getOperand(ImmIdx);
1111 break;
1112 }
1113 if (ImmOpnd)
1114 ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
1115 }
1116
1117 // Convert callee-save register save/restore instruction to do stack pointer
1118 // decrement/increment to allocate/deallocate the callee-save stack area by
1119 // converting store/load to use pre/post increment version.
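// For example, a prologue save such as
//   stp x29, x30, [sp, #0]
// preceded by a separate 16-byte SP decrement becomes the single instruction
//   stp x29, x30, [sp, #-16]!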
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
    bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
    MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
    int CFAOffset = 0) {
1126 unsigned NewOpc;
1127 switch (MBBI->getOpcode()) {
1128 default:
1129 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1130 case AArch64::STPXi:
1131 NewOpc = AArch64::STPXpre;
1132 break;
1133 case AArch64::STPDi:
1134 NewOpc = AArch64::STPDpre;
1135 break;
1136 case AArch64::STPQi:
1137 NewOpc = AArch64::STPQpre;
1138 break;
1139 case AArch64::STRXui:
1140 NewOpc = AArch64::STRXpre;
1141 break;
1142 case AArch64::STRDui:
1143 NewOpc = AArch64::STRDpre;
1144 break;
1145 case AArch64::STRQui:
1146 NewOpc = AArch64::STRQpre;
1147 break;
1148 case AArch64::LDPXi:
1149 NewOpc = AArch64::LDPXpost;
1150 break;
1151 case AArch64::LDPDi:
1152 NewOpc = AArch64::LDPDpost;
1153 break;
1154 case AArch64::LDPQi:
1155 NewOpc = AArch64::LDPQpost;
1156 break;
1157 case AArch64::LDRXui:
1158 NewOpc = AArch64::LDRXpost;
1159 break;
1160 case AArch64::LDRDui:
1161 NewOpc = AArch64::LDRDpost;
1162 break;
1163 case AArch64::LDRQui:
1164 NewOpc = AArch64::LDRQpost;
1165 break;
1166 }
1167 // Get rid of the SEH code associated with the old instruction.
1168 if (NeedsWinCFI) {
1169 auto SEH = std::next(MBBI);
1170 if (AArch64InstrInfo::isSEHInstruction(*SEH))
1171 SEH->eraseFromParent();
1172 }
1173
1174 TypeSize Scale = TypeSize::Fixed(1);
1175 unsigned Width;
1176 int64_t MinOffset, MaxOffset;
1177 bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
1178 NewOpc, Scale, Width, MinOffset, MaxOffset);
1179 (void)Success;
1180 assert(Success && "unknown load/store opcode");
1181
1182 // If the first store isn't right where we want SP then we can't fold the
1183 // update in so create a normal arithmetic instruction instead.
1184 MachineFunction &MF = *MBB.getParent();
1185 if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
1186 CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
1187 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1188 StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
1189 false, false, nullptr, EmitCFI,
1190 StackOffset::getFixed(CFAOffset));
1191
1192 return std::prev(MBBI);
1193 }
1194
1195 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
1196 MIB.addReg(AArch64::SP, RegState::Define);
1197
1198 // Copy all operands other than the immediate offset.
1199 unsigned OpndIdx = 0;
1200 for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
1201 ++OpndIdx)
1202 MIB.add(MBBI->getOperand(OpndIdx));
1203
1204 assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
1205 "Unexpected immediate offset in first/last callee-save save/restore "
1206 "instruction!");
1207 assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
1208 "Unexpected base register in callee-save save/restore instruction!");
1209 assert(CSStackSizeInc % Scale == 0);
1210 MIB.addImm(CSStackSizeInc / (int)Scale);
1211
1212 MIB.setMIFlags(MBBI->getFlags());
1213 MIB.setMemRefs(MBBI->memoperands());
1214
1215 // Generate a new SEH code that corresponds to the new instruction.
1216 if (NeedsWinCFI) {
1217 *HasWinCFI = true;
1218 InsertSEH(*MIB, *TII, FrameFlag);
1219 }
1220
1221 if (EmitCFI) {
1222 unsigned CFIIndex = MF.addFrameInst(
1223 MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc));
1224 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1225 .addCFIIndex(CFIIndex)
1226 .setMIFlags(FrameFlag);
1227 }
1228
1229 return std::prev(MBB.erase(MBBI));
1230 }
1231
1232 // Fixup callee-save register save/restore instructions to take into account
1233 // combined SP bump by adding the local stack size to the stack offsets.
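// For example, with a 64-byte local area a callee-save store
//   stp x29, x30, [sp, #16]
// is rewritten as
//   stp x29, x30, [sp, #80]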
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
                                              uint64_t LocalStackSize,
                                              bool NeedsWinCFI,
                                              bool *HasWinCFI) {
1238 if (AArch64InstrInfo::isSEHInstruction(MI))
1239 return;
1240
1241 unsigned Opc = MI.getOpcode();
1242 unsigned Scale;
1243 switch (Opc) {
1244 case AArch64::STPXi:
1245 case AArch64::STRXui:
1246 case AArch64::STPDi:
1247 case AArch64::STRDui:
1248 case AArch64::LDPXi:
1249 case AArch64::LDRXui:
1250 case AArch64::LDPDi:
1251 case AArch64::LDRDui:
1252 Scale = 8;
1253 break;
1254 case AArch64::STPQi:
1255 case AArch64::STRQui:
1256 case AArch64::LDPQi:
1257 case AArch64::LDRQui:
1258 Scale = 16;
1259 break;
1260 default:
1261 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1262 }
1263
1264 unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1265 assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1266 "Unexpected base register in callee-save save/restore instruction!");
1267 // Last operand is immediate offset that needs fixing.
1268 MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1269 // All generated opcodes have scaled offsets.
1270 assert(LocalStackSize % Scale == 0);
1271 OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1272
1273 if (NeedsWinCFI) {
1274 *HasWinCFI = true;
1275 auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1276 assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1277 assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1278 "Expecting a SEH instruction");
1279 fixupSEHOpcode(MBBI, LocalStackSize);
1280 }
1281 }
1282
static bool isTargetWindows(const MachineFunction &MF) {
1284 return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1285 }
1286
1287 // Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1289 switch (I->getOpcode()) {
1290 default:
1291 return false;
1292 case AArch64::STR_ZXI:
1293 case AArch64::STR_PXI:
1294 case AArch64::LDR_ZXI:
1295 case AArch64::LDR_PXI:
1296 return I->getFlag(MachineInstr::FrameSetup) ||
1297 I->getFlag(MachineInstr::FrameDestroy);
1298 }
1299 }
1300
static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) {
1302 if (!(llvm::any_of(
1303 MF.getFrameInfo().getCalleeSavedInfo(),
1304 [](const auto &Info) { return Info.getReg() == AArch64::LR; }) &&
1305 MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
1306 return false;
1307
1308 if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
1309 report_fatal_error("Must reserve x18 to use shadow call stack");
1310
1311 return true;
1312 }
1313
static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
                                        MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL, bool NeedsWinCFI,
                                        bool NeedsUnwindInfo) {
1320 // Shadow call stack prolog: str x30, [x18], #8
1321 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
1322 .addReg(AArch64::X18, RegState::Define)
1323 .addReg(AArch64::LR)
1324 .addReg(AArch64::X18)
1325 .addImm(8)
1326 .setMIFlag(MachineInstr::FrameSetup);
1327
1328 // This instruction also makes x18 live-in to the entry block.
1329 MBB.addLiveIn(AArch64::X18);
1330
1331 if (NeedsWinCFI)
1332 BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
1333 .setMIFlag(MachineInstr::FrameSetup);
1334
1335 if (NeedsUnwindInfo) {
1336 // Emit a CFI instruction that causes 8 to be subtracted from the value of
1337 // x18 when unwinding past this frame.
1338 static const char CFIInst[] = {
1339 dwarf::DW_CFA_val_expression,
1340 18, // register
1341 2, // length
1342 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
1343 static_cast<char>(-8) & 0x7f, // addend (sleb128)
1344 };
1345 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
1346 nullptr, StringRef(CFIInst, sizeof(CFIInst))));
1347 BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
1348 .addCFIIndex(CFIIndex)
1349 .setMIFlag(MachineInstr::FrameSetup);
1350 }
1351 }
1352
static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
                                        MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL) {
1358 // Shadow call stack epilog: ldr x30, [x18, #-8]!
1359 BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
1360 .addReg(AArch64::X18, RegState::Define)
1361 .addReg(AArch64::LR, RegState::Define)
1362 .addReg(AArch64::X18)
1363 .addImm(-8)
1364 .setMIFlag(MachineInstr::FrameDestroy);
1365
1366 if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) {
1367 unsigned CFIIndex =
1368 MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18));
1369 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
1370 .addCFIIndex(CFIIndex)
1371 .setMIFlags(MachineInstr::FrameDestroy);
1372 }
1373 }
1374
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
1377 MachineBasicBlock::iterator MBBI = MBB.begin();
1378 const MachineFrameInfo &MFI = MF.getFrameInfo();
1379 const Function &F = MF.getFunction();
1380 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1381 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1382 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1383 MachineModuleInfo &MMI = MF.getMMI();
1384 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1385 bool EmitCFI = AFI->needsDwarfUnwindInfo();
1386 bool HasFP = hasFP(MF);
1387 bool NeedsWinCFI = needsWinCFI(MF);
1388 bool HasWinCFI = false;
1389 auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1390
1391 bool IsFunclet = MBB.isEHFuncletEntry();
1392
1393 // At this point, we're going to decide whether or not the function uses a
1394 // redzone. In most cases, the function doesn't have a redzone so let's
1395 // assume that's false and set it to true in the case that there's a redzone.
1396 AFI->setHasRedZone(false);
1397
1398 // Debug location must be unknown since the first debug location is used
1399 // to determine the end of the prologue.
1400 DebugLoc DL;
1401
1402 const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1403 if (needsShadowCallStackPrologueEpilogue(MF))
1404 emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
1405 MFnI.needsDwarfUnwindInfo());
1406
1407 if (MFnI.shouldSignReturnAddress()) {
1408 unsigned PACI;
1409 if (MFnI.shouldSignWithBKey()) {
1410 BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1411 .setMIFlag(MachineInstr::FrameSetup);
1412 PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
1413 } else {
1414 PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
1415 }
1416
1417 auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
1418 if (Subtarget.hasPAuth())
1419 MI.addReg(AArch64::LR, RegState::Define)
1420 .addReg(AArch64::LR)
1421 .addReg(AArch64::SP, RegState::InternalRead);
1422 MI.setMIFlag(MachineInstr::FrameSetup);
1423 if (EmitCFI) {
1424 unsigned CFIIndex =
1425 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1426 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1427 .addCFIIndex(CFIIndex)
1428 .setMIFlags(MachineInstr::FrameSetup);
1429 }
1430 }
1431 if (EmitCFI && MFnI.isMTETagged()) {
1432 BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
1433 .setMIFlag(MachineInstr::FrameSetup);
1434 }
1435
1436 // We signal the presence of a Swift extended frame to external tools by
1437 // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1438 // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI
1439 // bits so that is still true.
1440 if (HasFP && AFI->hasSwiftAsyncContext()) {
1441 switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1442 case SwiftAsyncFramePointerMode::DeploymentBased:
1443 if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
1444 // The special symbol below is absolute and has a *value* that can be
1445 // combined with the frame pointer to signal an extended frame.
1446 BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
1447 .addExternalSymbol("swift_async_extendedFramePointerFlags",
1448 AArch64II::MO_GOT);
1449 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
1450 .addUse(AArch64::FP)
1451 .addUse(AArch64::X16)
1452 .addImm(Subtarget.isTargetILP32() ? 32 : 0);
1453 break;
1454 }
1455 LLVM_FALLTHROUGH;
1456
1457 case SwiftAsyncFramePointerMode::Always:
1458 // ORR x29, x29, #0x1000_0000_0000_0000
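      // (0x1100 is the encoded logical immediate N=1, immr=4, imms=0, i.e. a
      // single set bit rotated into bit position 60.)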
1459 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1460 .addUse(AArch64::FP)
1461 .addImm(0x1100)
1462 .setMIFlag(MachineInstr::FrameSetup);
1463 break;
1464
1465 case SwiftAsyncFramePointerMode::Never:
1466 break;
1467 }
1468 }
1469
1470 // All calls are tail calls in GHC calling conv, and functions have no
1471 // prologue/epilogue.
1472 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1473 return;
1474
1475 // Set tagged base pointer to the requested stack slot.
1476 // Ideally it should match SP value after prologue.
1477 Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1478 if (TBPI)
1479 AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1480 else
1481 AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1482
1483 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1484
1485 // getStackSize() includes all the locals in its size calculation. We don't
1486 // include these locals when computing the stack size of a funclet, as they
1487 // are allocated in the parent's stack frame and accessed via the frame
1488 // pointer from the funclet. We only save the callee saved registers in the
1489 // funclet, which are really the callee saved registers of the parent
1490 // function, including the funclet.
1491 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1492 : MFI.getStackSize();
1493 if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1494 assert(!HasFP && "unexpected function without stack frame but with FP");
1495 assert(!SVEStackSize &&
1496 "unexpected function without stack frame but with SVE objects");
1497 // All of the stack allocation is for locals.
1498 AFI->setLocalStackSize(NumBytes);
1499 if (!NumBytes)
1500 return;
1501 // REDZONE: If the stack size is less than 128 bytes, we don't need
1502 // to actually allocate.
1503 if (canUseRedZone(MF)) {
1504 AFI->setHasRedZone(true);
1505 ++NumRedZoneFunctions;
1506 } else {
1507 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1508 StackOffset::getFixed(-NumBytes), TII,
1509 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1510 if (EmitCFI) {
1511 // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1512 MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1513 // Encode the stack size of the leaf function.
1514 unsigned CFIIndex = MF.addFrameInst(
1515 MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1516 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1517 .addCFIIndex(CFIIndex)
1518 .setMIFlags(MachineInstr::FrameSetup);
1519 }
1520 }
1521
1522 if (NeedsWinCFI) {
1523 HasWinCFI = true;
1524 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1525 .setMIFlag(MachineInstr::FrameSetup);
1526 }
1527
1528 return;
1529 }
1530
1531 bool IsWin64 =
1532 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1533 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1534
1535 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1536 // All of the remaining stack allocations are for locals.
1537 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
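  // For illustration: a 96-byte frame with a 32-byte callee-save area and no
  // fixed object leaves a local stack size of 64 bytes.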
1538 bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1539 bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1540 if (CombineSPBump) {
1541 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1542 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1543 StackOffset::getFixed(-NumBytes), TII,
1544 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
1545 EmitCFI);
1546 NumBytes = 0;
1547 } else if (HomPrologEpilog) {
1548 // Stack has been already adjusted.
1549 NumBytes -= PrologueSaveSize;
1550 } else if (PrologueSaveSize != 0) {
1551 MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1552 MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
1553 EmitCFI);
1554 NumBytes -= PrologueSaveSize;
1555 }
1556 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1557
1558 // Move past the saves of the callee-saved registers, fixing up the offsets
1559 // and pre-inc if we decided to combine the callee-save and local stack
1560 // pointer bump above.
1561 MachineBasicBlock::iterator End = MBB.end();
1562 while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1563 !IsSVECalleeSave(MBBI)) {
1564 if (CombineSPBump)
1565 fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1566 NeedsWinCFI, &HasWinCFI);
1567 ++MBBI;
1568 }
1569
1570 // For funclets the FP belongs to the containing function.
1571 if (!IsFunclet && HasFP) {
1572 // Only set up FP if we actually need to.
1573 int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1574
1575 if (CombineSPBump)
1576 FPOffset += AFI->getLocalStackSize();
1577
1578 if (AFI->hasSwiftAsyncContext()) {
1579 // Before we update the live FP we have to ensure there's a valid (or
1580 // null) asynchronous context in its slot just before FP in the frame
1581 // record, so store it now.
1582 const auto &Attrs = MF.getFunction().getAttributes();
1583 bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1584 if (HaveInitialContext)
1585 MBB.addLiveIn(AArch64::X22);
1586 BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1587 .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
1588 .addUse(AArch64::SP)
1589 .addImm(FPOffset - 8)
1590 .setMIFlags(MachineInstr::FrameSetup);
1591 }
1592
1593 if (HomPrologEpilog) {
1594 auto Prolog = MBBI;
1595 --Prolog;
1596 assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1597 Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1598 } else {
1599 // Issue sub fp, sp, FPOffset or
1600 // mov fp,sp when FPOffset is zero.
1601 // Note: All stores of callee-saved registers are marked as "FrameSetup".
1602 // This code marks the instruction(s) that set the FP also.
1603 emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1604 StackOffset::getFixed(FPOffset), TII,
1605 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1606 }
1607 if (EmitCFI) {
1608 // Define the current CFA rule to use the provided FP.
1609 const int OffsetToFirstCalleeSaveFromFP =
1610 AFI->getCalleeSaveBaseToFrameRecordOffset() -
1611 AFI->getCalleeSavedStackSize();
1612 Register FramePtr = RegInfo->getFrameRegister(MF);
1613 unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1614 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
1615 nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1616 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1617 .addCFIIndex(CFIIndex)
1618 .setMIFlags(MachineInstr::FrameSetup);
1619 }
1620 }
1621
1622 // Now emit the moves for whatever callee saved regs we have (including FP,
1623 // LR if those are saved). Frame instructions for SVE registers are emitted
1624 // later, after the instructions which actually save the SVE regs.
1625 if (EmitCFI)
1626 emitCalleeSavedGPRLocations(MBB, MBBI);
1627
1628 if (windowsRequiresStackProbe(MF, NumBytes)) {
1629 uint64_t NumWords = NumBytes >> 4;
1630 if (NeedsWinCFI) {
1631 HasWinCFI = true;
1632 // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1633 // exceed this amount. We need to move at most 2^24 - 1 into x15.
1634 // This is at most two instructions, MOVZ followed by MOVK.
1635 // TODO: Fix to use multiple stack alloc unwind codes for stacks
1636 // exceeding 256MB in size.
1637 if (NumBytes >= (1 << 28))
1638 report_fatal_error("Stack size cannot exceed 256MB for stack "
1639 "unwinding purposes");
1640
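      // For example, a 1 MiB frame gives NumWords == 0x10000: the MOVZ below
      // writes the low half (0x0000) and the MOVK supplies the high half (0x1).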
1641 uint32_t LowNumWords = NumWords & 0xFFFF;
1642 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1643 .addImm(LowNumWords)
1644 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1645 .setMIFlag(MachineInstr::FrameSetup);
1646 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1647 .setMIFlag(MachineInstr::FrameSetup);
1648 if ((NumWords & 0xFFFF0000) != 0) {
1649 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1650 .addReg(AArch64::X15)
1651 .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1652 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1653 .setMIFlag(MachineInstr::FrameSetup);
1654 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1655 .setMIFlag(MachineInstr::FrameSetup);
1656 }
1657 } else {
1658 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1659 .addImm(NumWords)
1660 .setMIFlags(MachineInstr::FrameSetup);
1661 }
1662
1663 switch (MF.getTarget().getCodeModel()) {
1664 case CodeModel::Tiny:
1665 case CodeModel::Small:
1666 case CodeModel::Medium:
1667 case CodeModel::Kernel:
1668 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1669 .addExternalSymbol("__chkstk")
1670 .addReg(AArch64::X15, RegState::Implicit)
1671 .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1672 .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1673 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1674 .setMIFlags(MachineInstr::FrameSetup);
1675 if (NeedsWinCFI) {
1676 HasWinCFI = true;
1677 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1678 .setMIFlag(MachineInstr::FrameSetup);
1679 }
1680 break;
1681 case CodeModel::Large:
1682 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1683 .addReg(AArch64::X16, RegState::Define)
1684 .addExternalSymbol("__chkstk")
1685 .addExternalSymbol("__chkstk")
1686 .setMIFlags(MachineInstr::FrameSetup);
1687 if (NeedsWinCFI) {
1688 HasWinCFI = true;
1689 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1690 .setMIFlag(MachineInstr::FrameSetup);
1691 }
1692
1693 BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1694 .addReg(AArch64::X16, RegState::Kill)
1695 .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1696 .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1697 .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1698 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1699 .setMIFlags(MachineInstr::FrameSetup);
1700 if (NeedsWinCFI) {
1701 HasWinCFI = true;
1702 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1703 .setMIFlag(MachineInstr::FrameSetup);
1704 }
1705 break;
1706 }
1707
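    // X15 holds the allocation size in 16-byte units; the UXTX #4 extend below
    // scales it back to bytes, i.e. SP -= NumWords * 16.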
1708 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1709 .addReg(AArch64::SP, RegState::Kill)
1710 .addReg(AArch64::X15, RegState::Kill)
1711 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1712 .setMIFlags(MachineInstr::FrameSetup);
1713 if (NeedsWinCFI) {
1714 HasWinCFI = true;
1715 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1716 .addImm(NumBytes)
1717 .setMIFlag(MachineInstr::FrameSetup);
1718 }
1719 NumBytes = 0;
1720 }
1721
1722 StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1723 MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1724
1725 // Process the SVE callee-saves to determine what space needs to be
1726 // allocated.
1727 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1728 // Find callee save instructions in frame.
1729 CalleeSavesBegin = MBBI;
1730 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1731 while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1732 ++MBBI;
1733 CalleeSavesEnd = MBBI;
1734
1735 AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1736 AllocateAfter = SVEStackSize - AllocateBefore;
1737 }
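  // For illustration: if two Z registers are spilled (CalleeSavedSize == 32
  // scalable bytes) out of a 48 scalable-byte SVE area, AllocateBefore covers
  // the 32 bytes of spills and AllocateAfter the remaining 16 bytes of locals.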
1738
1739 // Allocate space for the callee saves (if any).
1740 emitFrameOffset(
1741 MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII,
1742 MachineInstr::FrameSetup, false, false, nullptr,
1743 EmitCFI && !HasFP && AllocateBefore,
1744 StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
1745
1746 if (EmitCFI)
1747 emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
1748
1749 // Finally allocate remaining SVE stack space.
1750 emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1751 -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
1752 nullptr, EmitCFI && !HasFP && AllocateAfter,
1753 AllocateBefore + StackOffset::getFixed(
1754 (int64_t)MFI.getStackSize() - NumBytes));
1755
1756 // Allocate space for the rest of the frame.
1757 if (NumBytes) {
1758 // Alignment is required for the parent frame, not the funclet
1759 const bool NeedsRealignment =
1760 !IsFunclet && RegInfo->hasStackRealignment(MF);
1761 unsigned scratchSPReg = AArch64::SP;
1762
1763 if (NeedsRealignment) {
1764 scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1765 assert(scratchSPReg != AArch64::NoRegister);
1766 }
1767
1768 // If we're a leaf function, try using the red zone.
1769 if (!canUseRedZone(MF)) {
1770 // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1771 // the correct value here, as NumBytes also includes padding bytes,
1772 // which shouldn't be counted here.
1773 emitFrameOffset(
1774 MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1775 StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup,
1776 false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
1777 SVEStackSize +
1778 StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
1779 }
1780 if (NeedsRealignment) {
1781 const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1782 assert(NrBitsToZero > 1);
1783 assert(scratchSPReg != AArch64::SP);
1784
1785 // SUB X9, SP, NumBytes
1786 // -- X9 is temporary register, so shouldn't contain any live data here,
1787 // -- free to use. This is already produced by emitFrameOffset above.
1788 // AND SP, X9, 0b11111...0000
1789 // The logical immediates have a non-trivial encoding. The following
1790 // formula computes the encoded immediate with all ones but
1791 // NrBitsToZero zero bits as least significant bits.
1792 uint32_t andMaskEncoded = (1 << 12) // = N
1793 | ((64 - NrBitsToZero) << 6) // immr
1794 | ((64 - NrBitsToZero - 1) << 0); // imms
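      // For example, a 16-byte alignment (NrBitsToZero == 4) yields
      // andMaskEncoded == 0x1F3B, which encodes the 64-bit immediate
      // 0xFFFFFFFFFFFFFFF0 (all ones with the low 4 bits clear).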
1795
1796 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1797 .addReg(scratchSPReg, RegState::Kill)
1798 .addImm(andMaskEncoded);
1799 AFI->setStackRealigned(true);
1800 if (NeedsWinCFI) {
1801 HasWinCFI = true;
1802 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1803 .addImm(NumBytes & andMaskEncoded)
1804 .setMIFlag(MachineInstr::FrameSetup);
1805 }
1806 }
1807 }
1808
1809 // If we need a base pointer, set it up here. It's whatever the value of the
1810 // stack pointer is at this point. Any variable size objects will be allocated
1811 // after this, so we can still use the base pointer to reference locals.
1812 //
1813 // FIXME: Clarify FrameSetup flags here.
1814 // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1815 // needed.
1816 // For funclets the BP belongs to the containing function.
1817 if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1818 TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1819 false);
1820 if (NeedsWinCFI) {
1821 HasWinCFI = true;
1822 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1823 .setMIFlag(MachineInstr::FrameSetup);
1824 }
1825 }
1826
1827 // The very last FrameSetup instruction indicates the end of prologue. Emit a
1828 // SEH opcode indicating the prologue end.
1829 if (NeedsWinCFI && HasWinCFI) {
1830 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1831 .setMIFlag(MachineInstr::FrameSetup);
1832 }
1833
1834 // SEH funclets are passed the frame pointer in X1. If the parent
1835 // function uses the base register, then the base register is used
1836 // directly, and is not retrieved from X1.
1837 if (IsFunclet && F.hasPersonalityFn()) {
1838 EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1839 if (isAsynchronousEHPersonality(Per)) {
1840 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1841 .addReg(AArch64::X1)
1842 .setMIFlag(MachineInstr::FrameSetup);
1843 MBB.addLiveIn(AArch64::X1);
1844 }
1845 }
1846 }
1847
1848 static void InsertReturnAddressAuth(MachineFunction &MF,
1849 MachineBasicBlock &MBB) {
1850 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1851 if (!MFI.shouldSignReturnAddress())
1852 return;
1853 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1854 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1855
1856 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1857 DebugLoc DL;
1858 if (MBBI != MBB.end())
1859 DL = MBBI->getDebugLoc();
1860
1861 // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1862 // this instruction can safely be used for any v8a architecture.
1863 // From v8.3a onwards there are optimised authenticate LR and return
1864 // instructions, namely RETA{A,B}, that can be used instead. In this case the
1865 // DW_CFA_AARCH64_negate_ra_state can't be emitted.
1866 if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1867 MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1868 BuildMI(MBB, MBBI, DL,
1869 TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1870 .copyImplicitOps(*MBBI);
1871 MBB.erase(MBBI);
1872 } else {
1873 BuildMI(
1874 MBB, MBBI, DL,
1875 TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1876 .setMIFlag(MachineInstr::FrameDestroy);
1877
1878 unsigned CFIIndex =
1879 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1880 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1881 .addCFIIndex(CFIIndex)
1882 .setMIFlags(MachineInstr::FrameDestroy);
1883 }
1884 }
1885
1886 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1887 switch (MI.getOpcode()) {
1888 default:
1889 return false;
1890 case AArch64::CATCHRET:
1891 case AArch64::CLEANUPRET:
1892 return true;
1893 }
1894 }
1895
1896 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1897 MachineBasicBlock &MBB) const {
1898 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1899 MachineFrameInfo &MFI = MF.getFrameInfo();
1900 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1901 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1902 DebugLoc DL;
1903 bool NeedsWinCFI = needsWinCFI(MF);
1904 bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo();
1905 bool HasWinCFI = false;
1906 bool IsFunclet = false;
1907 auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1908
1909 if (MBB.end() != MBBI) {
1910 DL = MBBI->getDebugLoc();
1911 IsFunclet = isFuncletReturnInstr(*MBBI);
1912 }
1913
1914 auto FinishingTouches = make_scope_exit([&]() {
1915 InsertReturnAddressAuth(MF, MBB);
1916 if (needsShadowCallStackPrologueEpilogue(MF))
1917 emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
1918 if (EmitCFI)
1919 emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
1920 });
1921
1922 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1923 : MFI.getStackSize();
1924 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1925
1926 // All calls are tail calls in GHC calling conv, and functions have no
1927 // prologue/epilogue.
1928 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1929 return;
1930
1931 // How much of the stack used by incoming arguments this function is expected
1932 // to restore in this particular epilogue.
1933 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1934 bool IsWin64 =
1935 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1936 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1937
1938 int64_t AfterCSRPopSize = ArgumentStackToRestore;
1939 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1940 // We cannot rely on the local stack size set in emitPrologue if the function
1941 // has funclets, as funclets have different local stack size requirements, and
1942 // the current value set in emitPrologue may be that of the containing
1943 // function.
1944 if (MF.hasEHFunclets())
1945 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1946 if (homogeneousPrologEpilog(MF, &MBB)) {
1947 assert(!NeedsWinCFI);
1948 auto LastPopI = MBB.getFirstTerminator();
1949 if (LastPopI != MBB.begin()) {
1950 auto HomogeneousEpilog = std::prev(LastPopI);
1951 if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1952 LastPopI = HomogeneousEpilog;
1953 }
1954
1955 // Adjust local stack
1956 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1957 StackOffset::getFixed(AFI->getLocalStackSize()), TII,
1958 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1959
1960 // SP has been already adjusted while restoring callee save regs.
1961 // We've bailed-out the case with adjusting SP for arguments.
1962 assert(AfterCSRPopSize == 0);
1963 return;
1964 }
1965 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1966 // Assume we can't combine the last pop with the sp restore.
1967
1968 bool CombineAfterCSRBump = false;
1969 if (!CombineSPBump && PrologueSaveSize != 0) {
1970 MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1971 while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
1972 AArch64InstrInfo::isSEHInstruction(*Pop))
1973 Pop = std::prev(Pop);
1974 // Converting the last ldp to a post-index ldp is valid only if the last
1975 // ldp's offset is 0.
1976 const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1977 // If the offset is 0 and the AfterCSR pop is not actually trying to
1978 // allocate more stack for arguments (in space that an untimely interrupt
1979 // may clobber), convert it to a post-index ldp.
1980 if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
1981 convertCalleeSaveRestoreToSPPrePostIncDec(
1982 MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
1983 MachineInstr::FrameDestroy, PrologueSaveSize);
1984 } else {
1985 // If not, make sure to emit an add after the last ldp.
1986 // We're doing this by transfering the size to be restored from the
1987 // adjustment *before* the CSR pops to the adjustment *after* the CSR
1988 // pops.
1989 AfterCSRPopSize += PrologueSaveSize;
1990 CombineAfterCSRBump = true;
1991 }
1992 }
1993
1994 // Move past the restores of the callee-saved registers.
1995 // If we plan on combining the sp bump of the local stack size and the callee
1996 // save stack size, we might need to adjust the CSR save and restore offsets.
1997 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1998 MachineBasicBlock::iterator Begin = MBB.begin();
1999 while (LastPopI != Begin) {
2000 --LastPopI;
2001 if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
2002 IsSVECalleeSave(LastPopI)) {
2003 ++LastPopI;
2004 break;
2005 } else if (CombineSPBump)
2006 fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
2007 NeedsWinCFI, &HasWinCFI);
2008 }
2009
2010 if (MF.hasWinCFI()) {
2011 // If the prologue didn't contain any SEH opcodes and didn't set the
2012 // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
2013 // EpilogStart - to avoid generating CFI for functions that don't need it.
2014 // (And as we didn't generate any prologue at all, it would be asymmetrical
2015 // to the epilogue.) By the end of the function, we assert that
2016 // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
2017 HasWinCFI = true;
2018 BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
2019 .setMIFlag(MachineInstr::FrameDestroy);
2020 }
2021
2022 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2023 switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
2024 case SwiftAsyncFramePointerMode::DeploymentBased:
2025 // Avoid the reload as it is GOT relative, and instead fall back to the
2026 // hardcoded value below. This allows a mismatch between the OS and
2027 // application without immediately terminating on the difference.
2028 LLVM_FALLTHROUGH;
2029 case SwiftAsyncFramePointerMode::Always:
2030 // We need to reset FP to its untagged state on return. Bit 60 is
2031 // currently used to show the presence of an extended frame.
2032
2033 // BIC x29, x29, #0x1000_0000_0000_0000
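      // (0x10fe is the encoded logical immediate for 0xEFFFFFFFFFFFFFFF,
      // i.e. all ones except bit 60.)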
2034 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
2035 AArch64::FP)
2036 .addUse(AArch64::FP)
2037 .addImm(0x10fe)
2038 .setMIFlag(MachineInstr::FrameDestroy);
2039 break;
2040
2041 case SwiftAsyncFramePointerMode::Never:
2042 break;
2043 }
2044 }
2045
2046 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2047
2048 // If there is a single SP update, insert it before the ret and we're done.
2049 if (CombineSPBump) {
2050 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
2051
2052 // When we are about to restore the CSRs, the CFA register is SP again.
2053 if (EmitCFI && hasFP(MF)) {
2054 const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2055 unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2056 unsigned CFIIndex =
2057 MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
2058 BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2059 .addCFIIndex(CFIIndex)
2060 .setMIFlags(MachineInstr::FrameDestroy);
2061 }
2062
2063 emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2064 StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
2065 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
2066 &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
2067 if (HasWinCFI)
2068 BuildMI(MBB, MBB.getFirstTerminator(), DL,
2069 TII->get(AArch64::SEH_EpilogEnd))
2070 .setMIFlag(MachineInstr::FrameDestroy);
2071 return;
2072 }
2073
2074 NumBytes -= PrologueSaveSize;
2075 assert(NumBytes >= 0 && "Negative stack allocation size!?");
2076
2077 // Process the SVE callee-saves to determine what space needs to be
2078 // deallocated.
2079 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
2080 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
2081 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2082 RestoreBegin = std::prev(RestoreEnd);
2083 while (RestoreBegin != MBB.begin() &&
2084 IsSVECalleeSave(std::prev(RestoreBegin)))
2085 --RestoreBegin;
2086
2087 assert(IsSVECalleeSave(RestoreBegin) &&
2088 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
2089
2090 StackOffset CalleeSavedSizeAsOffset =
2091 StackOffset::getScalable(CalleeSavedSize);
2092 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
2093 DeallocateAfter = CalleeSavedSizeAsOffset;
2094 }
2095
2096 // Deallocate the SVE area.
2097 if (SVEStackSize) {
2098 // If we have stack realignment or variable sized objects on the stack,
2099 // restore the stack pointer from the frame pointer prior to SVE CSR
2100 // restoration.
2101 if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
2102 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2103 // Set SP to start of SVE callee-save area from which they can
2104 // be reloaded. The code below will deallocate the stack space
2105 // by moving FP -> SP.
2106 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
2107 StackOffset::getScalable(-CalleeSavedSize), TII,
2108 MachineInstr::FrameDestroy);
2109 }
2110 } else {
2111 if (AFI->getSVECalleeSavedStackSize()) {
2112 // Deallocate the non-SVE locals first before we can deallocate (and
2113 // restore callee saves) from the SVE area.
2114 emitFrameOffset(
2115 MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2116 StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
2117 false, false, nullptr, EmitCFI && !hasFP(MF),
2118 SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
2119 NumBytes = 0;
2120 }
2121
2122 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2123 DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
2124 false, nullptr, EmitCFI && !hasFP(MF),
2125 SVEStackSize +
2126 StackOffset::getFixed(NumBytes + PrologueSaveSize));
2127
2128 emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
2129 DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
2130 false, nullptr, EmitCFI && !hasFP(MF),
2131 DeallocateAfter +
2132 StackOffset::getFixed(NumBytes + PrologueSaveSize));
2133 }
2134 if (EmitCFI)
2135 emitCalleeSavedSVERestores(MBB, RestoreEnd);
2136 }
2137
2138 if (!hasFP(MF)) {
2139 bool RedZone = canUseRedZone(MF);
2140 // If this was a redzone leaf function, we don't need to restore the
2141 // stack pointer (but we may need to pop stack args for fastcc).
2142 if (RedZone && AfterCSRPopSize == 0)
2143 return;
2144
2145 // Pop the local variables off the stack. If there are no callee-saved
2146 // registers, it means we are actually positioned at the terminator and can
2147 // combine stack increment for the locals and the stack increment for
2148 // callee-popped arguments into (possibly) a single instruction and be done.
2149 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
2150 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
2151 if (NoCalleeSaveRestore)
2152 StackRestoreBytes += AfterCSRPopSize;
2153
2154 emitFrameOffset(
2155 MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2156 StackOffset::getFixed(StackRestoreBytes), TII,
2157 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2158 StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
2159
2160 // If we were able to combine the local stack pop with the argument pop,
2161 // then we're done.
2162 if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
2163 if (HasWinCFI) {
2164 BuildMI(MBB, MBB.getFirstTerminator(), DL,
2165 TII->get(AArch64::SEH_EpilogEnd))
2166 .setMIFlag(MachineInstr::FrameDestroy);
2167 }
2168 return;
2169 }
2170
2171 NumBytes = 0;
2172 }
2173
2174 // Restore the original stack pointer.
2175 // FIXME: Rather than doing the math here, we should instead just use
2176 // non-post-indexed loads for the restores if we aren't actually going to
2177 // be able to save any instructions.
2178 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
2179 emitFrameOffset(
2180 MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
2181 StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
2182 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
2183 } else if (NumBytes)
2184 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2185 StackOffset::getFixed(NumBytes), TII,
2186 MachineInstr::FrameDestroy, false, NeedsWinCFI);
2187
2188 // When we are about to restore the CSRs, the CFA register is SP again.
2189 if (EmitCFI && hasFP(MF)) {
2190 const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2191 unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2192 unsigned CFIIndex = MF.addFrameInst(
2193 MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize));
2194 BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2195 .addCFIIndex(CFIIndex)
2196 .setMIFlags(MachineInstr::FrameDestroy);
2197 }
2198
2199 // This must be placed after the callee-save restore code because that code
2200 // assumes the SP is at the same location as it was after the callee-save spill
2201 // code in the prologue.
2202 if (AfterCSRPopSize) {
2203 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
2204 "interrupt may have clobbered");
2205
2206 emitFrameOffset(
2207 MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2208 StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
2209 false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2210 StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
2211 }
2212 if (HasWinCFI)
2213 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
2214 .setMIFlag(MachineInstr::FrameDestroy);
2215 }
2216
2217 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2218 /// debug info. It's the same as what we use for resolving the code-gen
2219 /// references for now. FIXME: This can go wrong when references are
2220 /// SP-relative and simple call frames aren't used.
2221 StackOffset
2222 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
2223 Register &FrameReg) const {
2224 return resolveFrameIndexReference(
2225 MF, FI, FrameReg,
2226 /*PreferFP=*/
2227 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
2228 /*ForSimm=*/false);
2229 }
2230
2231 StackOffset
2232 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
2233 int FI) const {
2234 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
2235 }
2236
2237 static StackOffset getFPOffset(const MachineFunction &MF,
2238 int64_t ObjectOffset) {
2239 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2240 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2241 bool IsWin64 =
2242 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
2243 unsigned FixedObject =
2244 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2245 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
2246 int64_t FPAdjust =
2247 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
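  // For illustration (no Win64 fixed object, CalleeSaveBaseToFrameRecordOffset
  // of 0, 64-byte callee-save area): an incoming stack argument at
  // ObjectOffset 0 resolves to fp + 64, and a local at ObjectOffset -80
  // resolves to fp - 16.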
2248 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
2249 }
2250
2251 static StackOffset getStackOffset(const MachineFunction &MF,
2252 int64_t ObjectOffset) {
2253 const auto &MFI = MF.getFrameInfo();
2254 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
2255 }
2256
2257 // TODO: This function currently does not work for scalable vectors.
2258 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2259 int FI) const {
2260 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2261 MF.getSubtarget().getRegisterInfo());
2262 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2263 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2264 ? getFPOffset(MF, ObjectOffset).getFixed()
2265 : getStackOffset(MF, ObjectOffset).getFixed();
2266 }
2267
2268 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2269 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2270 bool ForSimm) const {
2271 const auto &MFI = MF.getFrameInfo();
2272 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2273 bool isFixed = MFI.isFixedObjectIndex(FI);
2274 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2275 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2276 PreferFP, ForSimm);
2277 }
2278
2279 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2280 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2281 Register &FrameReg, bool PreferFP, bool ForSimm) const {
2282 const auto &MFI = MF.getFrameInfo();
2283 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2284 MF.getSubtarget().getRegisterInfo());
2285 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2286 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2287
2288 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2289 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2290 bool isCSR =
2291 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2292
2293 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2294
2295 // Use frame pointer to reference fixed objects. Use it for locals if
2296 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2297 // reliable as a base). Make sure useFPForScavengingIndex() does the
2298 // right thing for the emergency spill slot.
2299 bool UseFP = false;
2300 if (AFI->hasStackFrame() && !isSVE) {
2301 // We shouldn't prefer using the FP to access fixed-sized stack objects when
2302 // there are scalable (SVE) objects in between the FP and the fixed-sized
2303 // objects.
2304 PreferFP &= !SVEStackSize;
2305
2306 // Note: Keeping the following as multiple 'if' statements rather than
2307 // merging to a single expression for readability.
2308 //
2309 // Argument access should always use the FP.
2310 if (isFixed) {
2311 UseFP = hasFP(MF);
2312 } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2313 // References to the CSR area must use FP if we're re-aligning the stack
2314 // since the dynamically-sized alignment padding is between the SP/BP and
2315 // the CSR area.
2316 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2317 UseFP = true;
2318 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2319 // If the FPOffset is negative and we're producing a signed immediate, we
2320 // have to keep in mind that the available offset range for negative
2321 // offsets is smaller than for positive ones. If an offset is available
2322 // via the FP and the SP, use whichever is closest.
2323 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
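      // (-256 is the lower bound of the signed 9-bit unscaled LDUR/STUR
      // addressing range.)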
2324 PreferFP |= Offset > -FPOffset && !SVEStackSize;
2325
2326 if (MFI.hasVarSizedObjects()) {
2327 // If we have variable sized objects, we can use either FP or BP, as the
2328 // SP offset is unknown. We can use the base pointer if we have one and
2329 // FP is not preferred. If not, we're stuck with using FP.
2330 bool CanUseBP = RegInfo->hasBasePointer(MF);
2331 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2332 UseFP = PreferFP;
2333 else if (!CanUseBP) // Can't use BP. Forced to use FP.
2334 UseFP = true;
2335 // else we can use BP and FP, but the offset from FP won't fit.
2336 // That will make us scavenge registers which we can probably avoid by
2337 // using BP. If it won't fit for BP either, we'll scavenge anyway.
2338 } else if (FPOffset >= 0) {
2339 // Use SP or FP, whichever gives us the best chance of the offset
2340 // being in range for direct access. If the FPOffset is positive,
2341 // that'll always be best, as the SP will be even further away.
2342 UseFP = true;
2343 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2344 // Funclets access the locals contained in the parent's stack frame
2345 // via the frame pointer, so we have to use the FP in the parent
2346 // function.
2347 (void) Subtarget;
2348 assert(
2349 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2350 "Funclets should only be present on Win64");
2351 UseFP = true;
2352 } else {
2353 // We have the choice between FP and (SP or BP).
2354 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2355 UseFP = true;
2356 }
2357 }
2358 }
2359
2360 assert(
2361 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2362 "In the presence of dynamic stack pointer realignment, "
2363 "non-argument/CSR objects cannot be accessed through the frame pointer");
2364
2365 if (isSVE) {
2366 StackOffset FPOffset =
2367 StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2368 StackOffset SPOffset =
2369 SVEStackSize +
2370 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2371 ObjectOffset);
2372 // Always use the FP for SVE spills if available and beneficial.
2373 if (hasFP(MF) && (SPOffset.getFixed() ||
2374 FPOffset.getScalable() < SPOffset.getScalable() ||
2375 RegInfo->hasStackRealignment(MF))) {
2376 FrameReg = RegInfo->getFrameRegister(MF);
2377 return FPOffset;
2378 }
2379
2380 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2381 : (unsigned)AArch64::SP;
2382 return SPOffset;
2383 }
2384
2385 StackOffset ScalableOffset = {};
2386 if (UseFP && !(isFixed || isCSR))
2387 ScalableOffset = -SVEStackSize;
2388 if (!UseFP && (isFixed || isCSR))
2389 ScalableOffset = SVEStackSize;
2390
2391 if (UseFP) {
2392 FrameReg = RegInfo->getFrameRegister(MF);
2393 return StackOffset::getFixed(FPOffset) + ScalableOffset;
2394 }
2395
2396 // Use the base pointer if we have one.
2397 if (RegInfo->hasBasePointer(MF))
2398 FrameReg = RegInfo->getBaseRegister();
2399 else {
2400 assert(!MFI.hasVarSizedObjects() &&
2401 "Can't use SP when we have var sized objects.");
2402 FrameReg = AArch64::SP;
2403 // If we're using the red zone for this function, the SP won't actually
2404 // be adjusted, so the offsets will be negative. They're also all
2405 // within range of the signed 9-bit immediate instructions.
2406 if (canUseRedZone(MF))
2407 Offset -= AFI->getLocalStackSize();
2408 }
2409
2410 return StackOffset::getFixed(Offset) + ScalableOffset;
2411 }
2412
2413 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2414 // Do not set a kill flag on values that are also marked as live-in. This
2415 // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2416 // callee saved registers.
2417 // Omitting the kill flags is conservatively correct even if the live-in
2418 // is not used after all.
2419 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2420 return getKillRegState(!IsLiveIn);
2421 }
2422
2423 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2424 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2425 AttributeList Attrs = MF.getFunction().getAttributes();
2426 return Subtarget.isTargetMachO() &&
2427 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2428 Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2429 MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2430 }
2431
2432 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2433 bool NeedsWinCFI, bool IsFirst) {
2434 // If we are generating register pairs for a Windows function that requires
2435 // EH support, then pair consecutive registers only. There are no unwind
2436 // opcodes for saves/restores of non-consecutive register pairs.
2437 // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2438 // save_lrpair.
2439 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2440
2441 if (Reg2 == AArch64::FP)
2442 return true;
2443 if (!NeedsWinCFI)
2444 return false;
2445 if (Reg2 == Reg1 + 1)
2446 return false;
2447 // If pairing a GPR with LR, the pair can be described by the save_lrpair
2448 // opcode. If this is the first register pair, it would end up with a
2449 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2450 // if LR is paired with something other than the first register.
2451 // The save_lrpair opcode requires the first register to be an odd one.
2452 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2453 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2454 return false;
2455 return true;
2456 }
2457
2458 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2459 /// WindowsCFI requires that only consecutive registers can be paired.
2460 /// LR and FP need to be allocated together when the frame needs to save
2461 /// the frame-record. This means any other register pairing with LR is invalid.
2462 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2463 bool UsesWinAAPCS, bool NeedsWinCFI,
2464 bool NeedsFrameRecord, bool IsFirst) {
2465 if (UsesWinAAPCS)
2466 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2467
2468 // If we need to store the frame record, don't pair any register
2469 // with LR other than FP.
2470 if (NeedsFrameRecord)
2471 return Reg2 == AArch64::LR;
2472
2473 return false;
2474 }
2475
2476 namespace {
2477
2478 struct RegPairInfo {
2479 unsigned Reg1 = AArch64::NoRegister;
2480 unsigned Reg2 = AArch64::NoRegister;
2481 int FrameIdx;
2482 int Offset;
2483 enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2484
2485 RegPairInfo() = default;
2486
2487 bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2488
2489 unsigned getScale() const {
2490 switch (Type) {
2491 case PPR:
2492 return 2;
2493 case GPR:
2494 case FPR64:
2495 return 8;
2496 case ZPR:
2497 case FPR128:
2498 return 16;
2499 }
2500 llvm_unreachable("Unsupported type");
2501 }
2502
2503 bool isScalable() const { return Type == PPR || Type == ZPR; }
2504 };
2505
2506 } // end anonymous namespace
2507
2508 static void computeCalleeSaveRegisterPairs(
2509 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2510 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2511 bool NeedsFrameRecord) {
2512
2513 if (CSI.empty())
2514 return;
2515
2516 bool IsWindows = isTargetWindows(MF);
2517 bool NeedsWinCFI = needsWinCFI(MF);
2518 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2519 MachineFrameInfo &MFI = MF.getFrameInfo();
2520 CallingConv::ID CC = MF.getFunction().getCallingConv();
2521 unsigned Count = CSI.size();
2522 (void)CC;
2523 // MachO's compact unwind format relies on all registers being stored in
2524 // pairs.
2525 assert((!produceCompactUnwindFrame(MF) ||
2526 CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
2527 (Count & 1) == 0) &&
2528 "Odd number of callee-saved regs to spill!");
2529 int ByteOffset = AFI->getCalleeSavedStackSize();
2530 int StackFillDir = -1;
2531 int RegInc = 1;
2532 unsigned FirstReg = 0;
2533 if (NeedsWinCFI) {
2534 // For WinCFI, fill the stack from the bottom up.
2535 ByteOffset = 0;
2536 StackFillDir = 1;
2537 // As the CSI array is reversed to match PrologEpilogInserter, iterate
2538 // backwards, to pair up registers starting from lower numbered registers.
2539 RegInc = -1;
2540 FirstReg = Count - 1;
2541 }
2542 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2543 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2544
2545 // When iterating backwards, the loop condition relies on unsigned wraparound.
2546 for (unsigned i = FirstReg; i < Count; i += RegInc) {
2547 RegPairInfo RPI;
2548 RPI.Reg1 = CSI[i].getReg();
2549
2550 if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2551 RPI.Type = RegPairInfo::GPR;
2552 else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2553 RPI.Type = RegPairInfo::FPR64;
2554 else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2555 RPI.Type = RegPairInfo::FPR128;
2556 else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2557 RPI.Type = RegPairInfo::ZPR;
2558 else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2559 RPI.Type = RegPairInfo::PPR;
2560 else
2561 llvm_unreachable("Unsupported register class.");
2562
2563 // Add the next reg to the pair if it is in the same register class.
2564 if (unsigned(i + RegInc) < Count) {
2565 Register NextReg = CSI[i + RegInc].getReg();
2566 bool IsFirst = i == FirstReg;
2567 switch (RPI.Type) {
2568 case RegPairInfo::GPR:
2569 if (AArch64::GPR64RegClass.contains(NextReg) &&
2570 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2571 NeedsWinCFI, NeedsFrameRecord, IsFirst))
2572 RPI.Reg2 = NextReg;
2573 break;
2574 case RegPairInfo::FPR64:
2575 if (AArch64::FPR64RegClass.contains(NextReg) &&
2576 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2577 IsFirst))
2578 RPI.Reg2 = NextReg;
2579 break;
2580 case RegPairInfo::FPR128:
2581 if (AArch64::FPR128RegClass.contains(NextReg))
2582 RPI.Reg2 = NextReg;
2583 break;
2584 case RegPairInfo::PPR:
2585 case RegPairInfo::ZPR:
2586 break;
2587 }
2588 }
2589
2590 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2591 // list to come in sorted by frame index so that we can issue the store
2592 // pair instructions directly. Assert if we see anything otherwise.
2593 //
2594 // The order of the registers in the list is controlled by
2595 // getCalleeSavedRegs(), so they will always be in-order, as well.
2596 assert((!RPI.isPaired() ||
2597 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2598 "Out of order callee saved regs!");
2599
2600 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2601 RPI.Reg1 == AArch64::LR) &&
2602 "FrameRecord must be allocated together with LR");
2603
2604 // Windows AAPCS has FP and LR reversed.
2605 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2606 RPI.Reg2 == AArch64::LR) &&
2607 "FrameRecord must be allocated together with LR");
2608
2609 // MachO's compact unwind format relies on all registers being stored in
2610 // adjacent register pairs.
2611 assert((!produceCompactUnwindFrame(MF) ||
2612 CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
2613 (RPI.isPaired() &&
2614 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2615 RPI.Reg1 + 1 == RPI.Reg2))) &&
2616 "Callee-save registers not saved as adjacent register pair!");
2617
2618 RPI.FrameIdx = CSI[i].getFrameIdx();
2619 if (NeedsWinCFI &&
2620 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2621 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2622
2623 int Scale = RPI.getScale();
2624
2625 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2626 assert(OffsetPre % Scale == 0);
2627
2628 if (RPI.isScalable())
2629 ScalableByteOffset += StackFillDir * Scale;
2630 else
2631 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2632
2633 // Swift's async context is directly before FP, so allocate an extra
2634 // 8 bytes for it.
2635 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2636 RPI.Reg2 == AArch64::FP)
2637 ByteOffset += StackFillDir * 8;
2638
2639 assert(!(RPI.isScalable() && RPI.isPaired()) &&
2640 "Paired spill/fill instructions don't exist for SVE vectors");
2641
2642 // Round up size of non-pair to pair size if we need to pad the
2643 // callee-save area to ensure 16-byte alignment.
2644 if (NeedGapToAlignStack && !NeedsWinCFI &&
2645 !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2646 !RPI.isPaired() && ByteOffset % 16 != 0) {
2647 ByteOffset += 8 * StackFillDir;
2648 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2649 // A stack frame with a gap looks like this, bottom up:
2650 // d9, d8. x21, gap, x20, x19.
2651 // Set extra alignment on the x21 object to create the gap above it.
2652 MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2653 NeedGapToAlignStack = false;
2654 }
2655
2656 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2657 assert(OffsetPost % Scale == 0);
2658 // If filling top down (default), we want the offset after incrementing it.
2659 // If filling bottom up (WinCFI) we need the original offset.
2660 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2661
2662 // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2663 // Swift context can directly precede FP.
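    // For illustration, the 24-byte slot is laid out (low to high address) as
    // [async context][FP][LR], so the FP/LR pair starts 8 bytes in.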
2664 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2665 RPI.Reg2 == AArch64::FP)
2666 Offset += 8;
2667 RPI.Offset = Offset / Scale;
2668
2669 assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2670 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2671 "Offset out of bounds for LDP/STP immediate");
2672
2673 // Save the offset to frame record so that the FP register can point to the
2674 // innermost frame record (spilled FP and LR registers).
2675 if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2676 RPI.Reg2 == AArch64::FP) ||
2677 (IsWindows && RPI.Reg1 == AArch64::FP &&
2678 RPI.Reg2 == AArch64::LR)))
2679 AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2680
2681 RegPairs.push_back(RPI);
2682 if (RPI.isPaired())
2683 i += RegInc;
2684 }
2685 if (NeedsWinCFI) {
2686 // If we need an alignment gap in the stack, align the topmost stack
2687 // object. A stack frame with a gap looks like this, bottom up:
2688 // x19, d8. d9, gap.
2689 // Set extra alignment on the topmost stack object (the first element in
2690 // CSI, which goes top down), to create the gap above it.
2691 if (AFI->hasCalleeSaveStackFreeSpace())
2692 MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2693 // We iterated bottom up over the registers; flip RegPairs back to top
2694 // down order.
2695 std::reverse(RegPairs.begin(), RegPairs.end());
2696 }
2697 }
2698
2699 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2700 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2701 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2702 MachineFunction &MF = *MBB.getParent();
2703 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2704 bool NeedsWinCFI = needsWinCFI(MF);
2705 DebugLoc DL;
2706 SmallVector<RegPairInfo, 8> RegPairs;
2707
2708 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2709
2710 const MachineRegisterInfo &MRI = MF.getRegInfo();
2711 if (homogeneousPrologEpilog(MF)) {
2712 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2713 .setMIFlag(MachineInstr::FrameSetup);
2714
2715 for (auto &RPI : RegPairs) {
2716 MIB.addReg(RPI.Reg1);
2717 MIB.addReg(RPI.Reg2);
2718
2719 // Update register live in.
2720 if (!MRI.isReserved(RPI.Reg1))
2721 MBB.addLiveIn(RPI.Reg1);
2722 if (!MRI.isReserved(RPI.Reg2))
2723 MBB.addLiveIn(RPI.Reg2);
2724 }
2725 return true;
2726 }
2727 for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
2728 unsigned Reg1 = RPI.Reg1;
2729 unsigned Reg2 = RPI.Reg2;
2730 unsigned StrOpc;
2731
2732 // Issue sequence of spills for cs regs. The first spill may be converted
2733 // to a pre-decrement store later by emitPrologue if the callee-save stack
2734 // area allocation can't be combined with the local stack area allocation.
2735 // For example:
2736 // stp x22, x21, [sp, #0] // addImm(+0)
2737 // stp x20, x19, [sp, #16] // addImm(+2)
2738 // stp fp, lr, [sp, #32] // addImm(+4)
2739 // Rationale: This sequence saves uop updates compared to a sequence of
2740 // pre-increment spills like stp xi,xj,[sp,#-16]!
2741 // Note: Similar rationale and sequence for restores in epilog.
2742 unsigned Size;
2743 Align Alignment;
2744 switch (RPI.Type) {
2745 case RegPairInfo::GPR:
2746 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2747 Size = 8;
2748 Alignment = Align(8);
2749 break;
2750 case RegPairInfo::FPR64:
2751 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2752 Size = 8;
2753 Alignment = Align(8);
2754 break;
2755 case RegPairInfo::FPR128:
2756 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2757 Size = 16;
2758 Alignment = Align(16);
2759 break;
2760 case RegPairInfo::ZPR:
2761 StrOpc = AArch64::STR_ZXI;
2762 Size = 16;
2763 Alignment = Align(16);
2764 break;
2765 case RegPairInfo::PPR:
2766 StrOpc = AArch64::STR_PXI;
2767 Size = 2;
2768 Alignment = Align(2);
2769 break;
2770 }
2771 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2772 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2773 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2774 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2775 dbgs() << ")\n");
2776
2777 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2778 "Windows unwdinding requires a consecutive (FP,LR) pair");
2779 // Windows unwind codes require consecutive registers if registers are
2780 // paired. Make the switch here, so that the code below will save (x,x+1)
2781 // and not (x+1,x).
2782 unsigned FrameIdxReg1 = RPI.FrameIdx;
2783 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2784 if (NeedsWinCFI && RPI.isPaired()) {
2785 std::swap(Reg1, Reg2);
2786 std::swap(FrameIdxReg1, FrameIdxReg2);
2787 }
2788 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2789 if (!MRI.isReserved(Reg1))
2790 MBB.addLiveIn(Reg1);
2791 if (RPI.isPaired()) {
2792 if (!MRI.isReserved(Reg2))
2793 MBB.addLiveIn(Reg2);
2794 MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2795 MIB.addMemOperand(MF.getMachineMemOperand(
2796 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2797 MachineMemOperand::MOStore, Size, Alignment));
2798 }
2799 MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2800 .addReg(AArch64::SP)
2801 .addImm(RPI.Offset) // [sp, #offset*scale],
2802                                 // where the scale factor is implicit
2803 .setMIFlag(MachineInstr::FrameSetup);
2804 MIB.addMemOperand(MF.getMachineMemOperand(
2805 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2806 MachineMemOperand::MOStore, Size, Alignment));
2807 if (NeedsWinCFI)
2808 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2809
2810 // Update the StackIDs of the SVE stack slots.
2811 MachineFrameInfo &MFI = MF.getFrameInfo();
2812 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2813 MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2814
2815 }
2816 return true;
2817 }
2818
2819 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2820 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
2821 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2822 MachineFunction &MF = *MBB.getParent();
2823 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2824 DebugLoc DL;
2825 SmallVector<RegPairInfo, 8> RegPairs;
2826 bool NeedsWinCFI = needsWinCFI(MF);
2827
2828 if (MBBI != MBB.end())
2829 DL = MBBI->getDebugLoc();
2830
2831 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
2832
2833 auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
2834 unsigned Reg1 = RPI.Reg1;
2835 unsigned Reg2 = RPI.Reg2;
2836
2837 // Issue sequence of restores for cs regs. The last restore may be converted
2838 // to a post-increment load later by emitEpilogue if the callee-save stack
2839 // area allocation can't be combined with the local stack area allocation.
2840 // For example:
2841 // ldp fp, lr, [sp, #32] // addImm(+4)
2842 // ldp x20, x19, [sp, #16] // addImm(+2)
2843 // ldp x22, x21, [sp, #0] // addImm(+0)
2844 // Note: see comment in spillCalleeSavedRegisters()
2845 unsigned LdrOpc;
2846 unsigned Size;
2847 Align Alignment;
2848 switch (RPI.Type) {
2849 case RegPairInfo::GPR:
2850 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2851 Size = 8;
2852 Alignment = Align(8);
2853 break;
2854 case RegPairInfo::FPR64:
2855 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2856 Size = 8;
2857 Alignment = Align(8);
2858 break;
2859 case RegPairInfo::FPR128:
2860 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2861 Size = 16;
2862 Alignment = Align(16);
2863 break;
2864 case RegPairInfo::ZPR:
2865 LdrOpc = AArch64::LDR_ZXI;
2866 Size = 16;
2867 Alignment = Align(16);
2868 break;
2869 case RegPairInfo::PPR:
2870 LdrOpc = AArch64::LDR_PXI;
2871 Size = 2;
2872 Alignment = Align(2);
2873 break;
2874 }
2875 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2876 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2877 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2878 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2879 dbgs() << ")\n");
2880
2881 // Windows unwind codes require consecutive registers if registers are
2882     // paired. Make the switch here, so that the code below will restore (x,x+1)
2883 // and not (x+1,x).
2884 unsigned FrameIdxReg1 = RPI.FrameIdx;
2885 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2886 if (NeedsWinCFI && RPI.isPaired()) {
2887 std::swap(Reg1, Reg2);
2888 std::swap(FrameIdxReg1, FrameIdxReg2);
2889 }
2890 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
2891 if (RPI.isPaired()) {
2892 MIB.addReg(Reg2, getDefRegState(true));
2893 MIB.addMemOperand(MF.getMachineMemOperand(
2894 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2895 MachineMemOperand::MOLoad, Size, Alignment));
2896 }
2897 MIB.addReg(Reg1, getDefRegState(true))
2898 .addReg(AArch64::SP)
2899 .addImm(RPI.Offset) // [sp, #offset*scale]
2900                             // where the scale factor is implicit
2901 .setMIFlag(MachineInstr::FrameDestroy);
2902 MIB.addMemOperand(MF.getMachineMemOperand(
2903 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2904 MachineMemOperand::MOLoad, Size, Alignment));
2905 if (NeedsWinCFI)
2906 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2907
2908 return MIB->getIterator();
2909 };
2910
2911 // SVE objects are always restored in reverse order.
2912 for (const RegPairInfo &RPI : reverse(RegPairs))
2913 if (RPI.isScalable())
2914 EmitMI(RPI);
2915
2916 if (homogeneousPrologEpilog(MF, &MBB)) {
2917 auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
2918 .setMIFlag(MachineInstr::FrameDestroy);
2919 for (auto &RPI : RegPairs) {
2920 MIB.addReg(RPI.Reg1, RegState::Define);
2921 MIB.addReg(RPI.Reg2, RegState::Define);
2922 }
2923 return true;
2924 }
2925
2926 if (ReverseCSRRestoreSeq) {
2927 MachineBasicBlock::iterator First = MBB.end();
2928 for (const RegPairInfo &RPI : reverse(RegPairs)) {
2929 if (RPI.isScalable())
2930 continue;
2931 MachineBasicBlock::iterator It = EmitMI(RPI);
2932 if (First == MBB.end())
2933 First = It;
2934 }
2935 if (First != MBB.end())
2936 MBB.splice(MBBI, &MBB, First);
2937 } else {
2938 for (const RegPairInfo &RPI : RegPairs) {
2939 if (RPI.isScalable())
2940 continue;
2941 (void)EmitMI(RPI);
2942 }
2943 }
2944
2945 return true;
2946 }
2947
2948 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2949 BitVector &SavedRegs,
2950 RegScavenger *RS) const {
2951 // All calls are tail calls in GHC calling conv, and functions have no
2952 // prologue/epilogue.
2953 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2954 return;
2955
2956 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2957 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2958 MF.getSubtarget().getRegisterInfo());
2959 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2960 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2961 unsigned UnspilledCSGPR = AArch64::NoRegister;
2962 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2963
2964 MachineFrameInfo &MFI = MF.getFrameInfo();
2965 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2966
2967 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2968 ? RegInfo->getBaseRegister()
2969 : (unsigned)AArch64::NoRegister;
2970
2971 unsigned ExtraCSSpill = 0;
2972 // Figure out which callee-saved registers to save/restore.
2973 for (unsigned i = 0; CSRegs[i]; ++i) {
2974 const unsigned Reg = CSRegs[i];
2975
2976 // Add the base pointer register to SavedRegs if it is callee-save.
2977 if (Reg == BasePointerReg)
2978 SavedRegs.set(Reg);
2979
2980 bool RegUsed = SavedRegs.test(Reg);
2981 unsigned PairedReg = AArch64::NoRegister;
2982 if (AArch64::GPR64RegClass.contains(Reg) ||
2983 AArch64::FPR64RegClass.contains(Reg) ||
2984 AArch64::FPR128RegClass.contains(Reg))
2985 PairedReg = CSRegs[i ^ 1];
2986
2987 if (!RegUsed) {
2988 if (AArch64::GPR64RegClass.contains(Reg) &&
2989 !RegInfo->isReservedReg(MF, Reg)) {
2990 UnspilledCSGPR = Reg;
2991 UnspilledCSGPRPaired = PairedReg;
2992 }
2993 continue;
2994 }
2995
2996 // MachO's compact unwind format relies on all registers being stored in
2997 // pairs.
2998 // FIXME: the usual format is actually better if unwinding isn't needed.
2999 if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
3000 !SavedRegs.test(PairedReg)) {
3001 SavedRegs.set(PairedReg);
3002 if (AArch64::GPR64RegClass.contains(PairedReg) &&
3003 !RegInfo->isReservedReg(MF, PairedReg))
3004 ExtraCSSpill = PairedReg;
3005 }
3006 }
3007
3008 if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
3009 !Subtarget.isTargetWindows()) {
3010     // For the Windows calling convention on a non-Windows OS, where X18 is
3011     // treated as reserved, back up X18 when entering non-Windows code (marked
3012     // with the Windows calling convention) and restore it when returning,
3013     // regardless of whether the individual function uses it - it might call
3014     // other functions that clobber it.
3015 SavedRegs.set(AArch64::X18);
3016 }
3017
3018 // Calculates the callee saved stack size.
3019 unsigned CSStackSize = 0;
3020 unsigned SVECSStackSize = 0;
3021 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3022 const MachineRegisterInfo &MRI = MF.getRegInfo();
3023 for (unsigned Reg : SavedRegs.set_bits()) {
3024 auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
3025 if (AArch64::PPRRegClass.contains(Reg) ||
3026 AArch64::ZPRRegClass.contains(Reg))
3027 SVECSStackSize += RegSize;
3028 else
3029 CSStackSize += RegSize;
3030 }
3031
3032 // Save number of saved regs, so we can easily update CSStackSize later.
3033 unsigned NumSavedRegs = SavedRegs.count();
3034
3035   // The frame record needs to be created by saving the appropriate registers.
3036 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
3037 if (hasFP(MF) ||
3038 windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
3039 SavedRegs.set(AArch64::FP);
3040 SavedRegs.set(AArch64::LR);
3041 }
3042
3043 LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
3044              for (unsigned Reg : SavedRegs.set_bits())
3045                dbgs() << ' '
3046                       << printReg(Reg, RegInfo);
3047              dbgs() << "\n";);
3048
3049 // If any callee-saved registers are used, the frame cannot be eliminated.
3050 int64_t SVEStackSize =
3051 alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
3052 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
3053
3054 // The CSR spill slots have not been allocated yet, so estimateStackSize
3055 // won't include them.
3056 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
3057
3058 // Conservatively always assume BigStack when there are SVE spills.
3059 bool BigStack = SVEStackSize ||
3060 (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
3061 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
3062 AFI->setHasStackFrame(true);
3063
3064 // Estimate if we might need to scavenge a register at some point in order
3065 // to materialize a stack offset. If so, either spill one additional
3066 // callee-saved register or reserve a special spill slot to facilitate
3067 // register scavenging. If we already spilled an extra callee-saved register
3068 // above to keep the number of spills even, we don't need to do anything else
3069 // here.
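  // Illustrative (hypothetical) example: with roughly 40000 bytes of locals, a
  // reload such as "ldr x0, [sp, #39000]" cannot be encoded (the scaled
  // unsigned immediate tops out at 32760 for 8-byte accesses), so the address
  // must be materialized in a scratch GPR first, e.g.
  //   add x16, sp, #36864    // #9, lsl #12
  //   ldr x0, [x16, #2136]
  // which is why an extra CSR spill or an emergency slot is reserved below.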
3070 if (BigStack) {
3071 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
3072 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
3073 << " to get a scratch register.\n");
3074 SavedRegs.set(UnspilledCSGPR);
3075 // MachO's compact unwind format relies on all registers being stored in
3076 // pairs, so if we need to spill one extra for BigStack, then we need to
3077 // store the pair.
3078 if (producePairRegisters(MF))
3079 SavedRegs.set(UnspilledCSGPRPaired);
3080 ExtraCSSpill = UnspilledCSGPR;
3081 }
3082
3083 // If we didn't find an extra callee-saved register to spill, create
3084 // an emergency spill slot.
3085 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
3086 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3087 const TargetRegisterClass &RC = AArch64::GPR64RegClass;
3088 unsigned Size = TRI->getSpillSize(RC);
3089 Align Alignment = TRI->getSpillAlign(RC);
3090 int FI = MFI.CreateStackObject(Size, Alignment, false);
3091 RS->addScavengingFrameIndex(FI);
3092 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
3093 << " as the emergency spill slot.\n");
3094 }
3095 }
3096
3097 // Adding the size of additional 64bit GPR saves.
3098 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
3099
3100 // A Swift asynchronous context extends the frame record with a pointer
3101 // directly before FP.
3102 if (hasFP(MF) && AFI->hasSwiftAsyncContext())
3103 CSStackSize += 8;
3104
3105 uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
3106 LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
3107 << EstimatedStackSize + AlignedCSStackSize
3108 << " bytes.\n");
3109
3110 assert((!MFI.isCalleeSavedInfoValid() ||
3111 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
3112 "Should not invalidate callee saved info");
3113
3114 // Round up to register pair alignment to avoid additional SP adjustment
3115 // instructions.
3116 AFI->setCalleeSavedStackSize(AlignedCSStackSize);
3117 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
3118 AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
3119 }
3120
3121 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
3122 MachineFunction &MF, const TargetRegisterInfo *RegInfo,
3123 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
3124 unsigned &MaxCSFrameIndex) const {
3125 bool NeedsWinCFI = needsWinCFI(MF);
3126 // To match the canonical windows frame layout, reverse the list of
3127 // callee saved registers to get them laid out by PrologEpilogInserter
3128 // in the right order. (PrologEpilogInserter allocates stack objects top
3129 // down. Windows canonical prologs store higher numbered registers at
3130 // the top, thus have the CSI array start from the highest registers.)
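  // Illustrative example (hypothetical CSI contents): {x19, x20, x21, x22}
  // becomes {x22, x21, x20, x19}, so x22/x21 receive the highest-addressed
  // slots and x19/x20 end up nearest sp, matching the canonical Windows
  // prolog layout.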
3131 if (NeedsWinCFI)
3132 std::reverse(CSI.begin(), CSI.end());
3133
3134 if (CSI.empty())
3135 return true; // Early exit if no callee saved registers are modified!
3136
3137 // Now that we know which registers need to be saved and restored, allocate
3138 // stack slots for them.
3139 MachineFrameInfo &MFI = MF.getFrameInfo();
3140 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3141
3142 bool UsesWinAAPCS = isTargetWindows(MF);
3143 if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
3144 int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
3145 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3146 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3147 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3148 }
3149
3150 for (auto &CS : CSI) {
3151 Register Reg = CS.getReg();
3152 const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
3153
3154 unsigned Size = RegInfo->getSpillSize(*RC);
3155 Align Alignment(RegInfo->getSpillAlign(*RC));
3156 int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
3157 CS.setFrameIdx(FrameIdx);
3158
3159 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3160 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3161
3162 // Grab 8 bytes below FP for the extended asynchronous frame info.
3163 if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
3164 Reg == AArch64::FP) {
3165 FrameIdx = MFI.CreateStackObject(8, Alignment, true);
3166 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3167 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
3168 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
3169 }
3170 }
3171 return true;
3172 }
3173
3174 bool AArch64FrameLowering::enableStackSlotScavenging(
3175 const MachineFunction &MF) const {
3176 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3177 return AFI->hasCalleeSaveStackFreeSpace();
3178 }
3179
3180 /// Returns true if there are any SVE callee saves.
3181 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
3182 int &Min, int &Max) {
3183 Min = std::numeric_limits<int>::max();
3184 Max = std::numeric_limits<int>::min();
3185
3186 if (!MFI.isCalleeSavedInfoValid())
3187 return false;
3188
3189 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
3190 for (auto &CS : CSI) {
3191 if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
3192 AArch64::PPRRegClass.contains(CS.getReg())) {
3193 assert((Max == std::numeric_limits<int>::min() ||
3194 Max + 1 == CS.getFrameIdx()) &&
3195 "SVE CalleeSaves are not consecutive");
3196
3197 Min = std::min(Min, CS.getFrameIdx());
3198 Max = std::max(Max, CS.getFrameIdx());
3199 }
3200 }
3201 return Min != std::numeric_limits<int>::max();
3202 }
3203
3204 // Process all the SVE stack objects and determine offsets for each
3205 // object. If AssignOffsets is true, the offsets get assigned.
3206 // Fills in the first and last callee-saved frame indices into
3207 // Min/MaxCSFrameIndex, respectively.
3208 // Returns the size of the stack.
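// Illustrative example (hypothetical frame): with two 16-byte ZPR callee-save
// slots followed by one 16-byte SVE local, the code below assigns offsets
// -16 and -32 to the callee saves and -48 to the local, all in scalable bytes
// (multiplied by vscale at run time) measured from the top of the SVE area.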
3209 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
3210 int &MinCSFrameIndex,
3211 int &MaxCSFrameIndex,
3212 bool AssignOffsets) {
3213 #ifndef NDEBUG
3214 // First process all fixed stack objects.
3215 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
3216 assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
3217 "SVE vectors should never be passed on the stack by value, only by "
3218 "reference.");
3219 #endif
3220
3221 auto Assign = [&MFI](int FI, int64_t Offset) {
3222 LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
3223 MFI.setObjectOffset(FI, Offset);
3224 };
3225
3226 int64_t Offset = 0;
3227
3228 // Then process all callee saved slots.
3229 if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
3230 // Assign offsets to the callee save slots.
3231 for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3232 Offset += MFI.getObjectSize(I);
3233 Offset = alignTo(Offset, MFI.getObjectAlign(I));
3234 if (AssignOffsets)
3235 Assign(I, -Offset);
3236 }
3237 }
3238
3239   // Ensure that the callee-save area is aligned to 16 bytes.
3240 Offset = alignTo(Offset, Align(16U));
3241
3242 // Create a buffer of SVE objects to allocate and sort it.
3243 SmallVector<int, 8> ObjectsToAllocate;
3244 // If we have a stack protector, and we've previously decided that we have SVE
3245 // objects on the stack and thus need it to go in the SVE stack area, then it
3246 // needs to go first.
3247 int StackProtectorFI = -1;
3248 if (MFI.hasStackProtectorIndex()) {
3249 StackProtectorFI = MFI.getStackProtectorIndex();
3250 if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
3251 ObjectsToAllocate.push_back(StackProtectorFI);
3252 }
3253 for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3254 unsigned StackID = MFI.getStackID(I);
3255 if (StackID != TargetStackID::ScalableVector)
3256 continue;
3257 if (I == StackProtectorFI)
3258 continue;
3259 if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3260 continue;
3261 if (MFI.isDeadObjectIndex(I))
3262 continue;
3263
3264 ObjectsToAllocate.push_back(I);
3265 }
3266
3267 // Allocate all SVE locals and spills
3268 for (unsigned FI : ObjectsToAllocate) {
3269 Align Alignment = MFI.getObjectAlign(FI);
3270 // FIXME: Given that the length of SVE vectors is not necessarily a power of
3271 // two, we'd need to align every object dynamically at runtime if the
3272 // alignment is larger than 16. This is not yet supported.
3273 if (Alignment > Align(16))
3274 report_fatal_error(
3275 "Alignment of scalable vectors > 16 bytes is not yet supported");
3276
3277 Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3278 if (AssignOffsets)
3279 Assign(FI, -Offset);
3280 }
3281
3282 return Offset;
3283 }
3284
3285 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3286 MachineFrameInfo &MFI) const {
3287 int MinCSFrameIndex, MaxCSFrameIndex;
3288 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
3289 }
3290
3291 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3292 MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3293 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3294 true);
3295 }
3296
3297 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3298 MachineFunction &MF, RegScavenger *RS) const {
3299 MachineFrameInfo &MFI = MF.getFrameInfo();
3300
3301 assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3302 "Upwards growing stack unsupported");
3303
3304 int MinCSFrameIndex, MaxCSFrameIndex;
3305 int64_t SVEStackSize =
3306 assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3307
3308 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3309 AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3310 AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3311
3312 // If this function isn't doing Win64-style C++ EH, we don't need to do
3313 // anything.
3314 if (!MF.hasEHFunclets())
3315 return;
3316 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3317 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3318
3319 MachineBasicBlock &MBB = MF.front();
3320 auto MBBI = MBB.begin();
3321 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3322 ++MBBI;
3323
3324 // Create an UnwindHelp object.
3325 // The UnwindHelp object is allocated at the start of the fixed object area
3326 int64_t FixedObject =
3327 getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3328 int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3329 /*SPOffset*/ -FixedObject,
3330 /*IsImmutable=*/false);
3331 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3332
3333 // We need to store -2 into the UnwindHelp object at the start of the
3334 // function.
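  // Illustrative example (hypothetical register and final offset): the two
  // instructions built below might lower to something like
  //   mov  x8, #-2
  //   stur x8, [fp, #-16]
  // once the UnwindHelp frame index has been resolved.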
3335 DebugLoc DL;
3336 RS->enterBasicBlockEnd(MBB);
3337 RS->backward(std::prev(MBBI));
3338 Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3339 assert(DstReg && "There must be a free register after frame setup");
3340 BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3341 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3342 .addReg(DstReg, getKillRegState(true))
3343 .addFrameIndex(UnwindHelpFI)
3344 .addImm(0);
3345 }
3346
3347 namespace {
3348 struct TagStoreInstr {
3349 MachineInstr *MI;
3350 int64_t Offset, Size;
3351   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3352 : MI(MI), Offset(Offset), Size(Size) {}
3353 };
3354
3355 class TagStoreEdit {
3356 MachineFunction *MF;
3357 MachineBasicBlock *MBB;
3358 MachineRegisterInfo *MRI;
3359 // Tag store instructions that are being replaced.
3360 SmallVector<TagStoreInstr, 8> TagStores;
3361 // Combined memref arguments of the above instructions.
3362 SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3363
3364 // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3365 // FrameRegOffset + Size) with the address tag of SP.
3366 Register FrameReg;
3367 StackOffset FrameRegOffset;
3368 int64_t Size;
3369 // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
3370 Optional<int64_t> FrameRegUpdate;
3371 // MIFlags for any FrameReg updating instructions.
3372 unsigned FrameRegUpdateFlags;
3373
3374 // Use zeroing instruction variants.
3375 bool ZeroData;
3376 DebugLoc DL;
3377
3378 void emitUnrolled(MachineBasicBlock::iterator InsertI);
3379 void emitLoop(MachineBasicBlock::iterator InsertI);
3380
3381 public:
3382   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3383 : MBB(MBB), ZeroData(ZeroData) {
3384 MF = MBB->getParent();
3385 MRI = &MF->getRegInfo();
3386 }
3387   // Add an instruction to be replaced. Instructions must be added in
3388   // ascending order of Offset, and have to be adjacent.
3389   void addInstruction(TagStoreInstr I) {
3390 assert((TagStores.empty() ||
3391 TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3392 "Non-adjacent tag store instructions.");
3393 TagStores.push_back(I);
3394 }
3395   void clear() { TagStores.clear(); }
3396 // Emit equivalent code at the given location, and erase the current set of
3397 // instructions. May skip if the replacement is not profitable. May invalidate
3398 // the input iterator and replace it with a valid one.
3399 void emitCode(MachineBasicBlock::iterator &InsertI,
3400 const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
3401 };
3402
3403 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3404 const AArch64InstrInfo *TII =
3405 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3406
3407 const int64_t kMinOffset = -256 * 16;
3408 const int64_t kMaxOffset = 255 * 16;
3409
3410 Register BaseReg = FrameReg;
3411 int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3412 if (BaseRegOffsetBytes < kMinOffset ||
3413 BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
3414 Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3415 emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3416 StackOffset::getFixed(BaseRegOffsetBytes), TII);
3417 BaseReg = ScratchReg;
3418 BaseRegOffsetBytes = 0;
3419 }
3420
3421 MachineInstr *LastI = nullptr;
3422 while (Size) {
3423 int64_t InstrSize = (Size > 16) ? 32 : 16;
3424 unsigned Opcode =
3425 InstrSize == 16
3426 ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
3427 : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
3428 MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3429 .addReg(AArch64::SP)
3430 .addReg(BaseReg)
3431 .addImm(BaseRegOffsetBytes / 16)
3432 .setMemRefs(CombinedMemRefs);
3433 // A store to [BaseReg, #0] should go last for an opportunity to fold the
3434 // final SP adjustment in the epilogue.
3435 if (BaseRegOffsetBytes == 0)
3436 LastI = I;
3437 BaseRegOffsetBytes += InstrSize;
3438 Size -= InstrSize;
3439 }
3440
3441 if (LastI)
3442 MBB->splice(InsertI, MBB, LastI);
3443 }
3444
3445 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3446 const AArch64InstrInfo *TII =
3447 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3448
3449 Register BaseReg = FrameRegUpdate
3450 ? FrameReg
3451 : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3452 Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3453
3454 emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3455
3456 int64_t LoopSize = Size;
3457 // If the loop size is not a multiple of 32, split off one 16-byte store at
3458 // the end to fold BaseReg update into.
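  // Illustrative example (hypothetical sizes): for Size == 80, LoopSize becomes
  // 64 and the final 16 bytes are tagged with a post-indexed store such as
  // "stg x8, [x8], #32", which in this example also carries a further 16 bytes
  // of base-register update.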
3459 if (FrameRegUpdate && *FrameRegUpdate)
3460 LoopSize -= LoopSize % 32;
3461 MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3462 TII->get(ZeroData ? AArch64::STZGloop_wback
3463 : AArch64::STGloop_wback))
3464 .addDef(SizeReg)
3465 .addDef(BaseReg)
3466 .addImm(LoopSize)
3467 .addReg(BaseReg)
3468 .setMemRefs(CombinedMemRefs);
3469 if (FrameRegUpdate)
3470 LoopI->setFlags(FrameRegUpdateFlags);
3471
3472 int64_t ExtraBaseRegUpdate =
3473 FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3474 if (LoopSize < Size) {
3475 assert(FrameRegUpdate);
3476 assert(Size - LoopSize == 16);
3477 // Tag 16 more bytes at BaseReg and update BaseReg.
3478 BuildMI(*MBB, InsertI, DL,
3479 TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3480 .addDef(BaseReg)
3481 .addReg(BaseReg)
3482 .addReg(BaseReg)
3483 .addImm(1 + ExtraBaseRegUpdate / 16)
3484 .setMemRefs(CombinedMemRefs)
3485 .setMIFlags(FrameRegUpdateFlags);
3486 } else if (ExtraBaseRegUpdate) {
3487 // Update BaseReg.
3488 BuildMI(
3489 *MBB, InsertI, DL,
3490 TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3491 .addDef(BaseReg)
3492 .addReg(BaseReg)
3493 .addImm(std::abs(ExtraBaseRegUpdate))
3494 .addImm(0)
3495 .setMIFlags(FrameRegUpdateFlags);
3496 }
3497 }
3498
3499 // Check if *II is a register update that can be merged into the STGloop that
3500 // ends at (Reg + Size). If so, *TotalOffset is set to the update's total
3501 // offset; the adjustment remaining after the loop is applied separately.
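// Illustrative example (hypothetical values): if the loop tags 128 bytes
// starting at Reg and is followed by "add Reg, Reg, #160", the update can be
// merged, since the remaining post-loop adjustment (+32) is 16-byte aligned
// and fits an unshifted ADDXri; *TotalOffset then receives the full +160.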
3502 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3503 int64_t Size, int64_t *TotalOffset) {
3504 MachineInstr &MI = *II;
3505 if ((MI.getOpcode() == AArch64::ADDXri ||
3506 MI.getOpcode() == AArch64::SUBXri) &&
3507 MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3508 unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3509 int64_t Offset = MI.getOperand(2).getImm() << Shift;
3510 if (MI.getOpcode() == AArch64::SUBXri)
3511 Offset = -Offset;
3512 int64_t AbsPostOffset = std::abs(Offset - Size);
3513 const int64_t kMaxOffset =
3514 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3515 if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3516 *TotalOffset = Offset;
3517 return true;
3518 }
3519 }
3520 return false;
3521 }
3522
3523 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3524 SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3525 MemRefs.clear();
3526 for (auto &TS : TSE) {
3527 MachineInstr *MI = TS.MI;
3528 // An instruction without memory operands may access anything. Be
3529 // conservative and return an empty list.
3530 if (MI->memoperands_empty()) {
3531 MemRefs.clear();
3532 return;
3533 }
3534 MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3535 }
3536 }
3537
3538 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3539 const AArch64FrameLowering *TFI,
3540 bool TryMergeSPUpdate) {
3541 if (TagStores.empty())
3542 return;
3543 TagStoreInstr &FirstTagStore = TagStores[0];
3544 TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3545 Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3546 DL = TagStores[0].MI->getDebugLoc();
3547
3548 Register Reg;
3549 FrameRegOffset = TFI->resolveFrameOffsetReference(
3550 *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3551 /*PreferFP=*/false, /*ForSimm=*/true);
3552 FrameReg = Reg;
3553 FrameRegUpdate = None;
3554
3555 mergeMemRefs(TagStores, CombinedMemRefs);
3556
3557 LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3558 for (const auto &Instr
3559 : TagStores) { dbgs() << " " << *Instr.MI; });
3560
3561 // Size threshold where a loop becomes shorter than a linear sequence of
3562 // tagging instructions.
3563 const int kSetTagLoopThreshold = 176;
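  // (Illustrative: at 176 bytes the unrolled form already needs six tag-store
  // instructions, roughly the point at which the shorter STGloop expansion, a
  // little setup plus a compact loop body, starts to win.)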
3564 if (Size < kSetTagLoopThreshold) {
3565 if (TagStores.size() < 2)
3566 return;
3567 emitUnrolled(InsertI);
3568 } else {
3569 MachineInstr *UpdateInstr = nullptr;
3570 int64_t TotalOffset = 0;
3571 if (TryMergeSPUpdate) {
3572 // See if we can merge base register update into the STGloop.
3573 // This is done in AArch64LoadStoreOptimizer for "normal" stores,
3574 // but STGloop is way too unusual for that, and also it only
3575 // realistically happens in function epilogue. Also, STGloop is expanded
3576 // before that pass.
3577 if (InsertI != MBB->end() &&
3578 canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3579 &TotalOffset)) {
3580 UpdateInstr = &*InsertI++;
3581 LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
3582 << *UpdateInstr);
3583 }
3584 }
3585
3586 if (!UpdateInstr && TagStores.size() < 2)
3587 return;
3588
3589 if (UpdateInstr) {
3590 FrameRegUpdate = TotalOffset;
3591 FrameRegUpdateFlags = UpdateInstr->getFlags();
3592 }
3593 emitLoop(InsertI);
3594 if (UpdateInstr)
3595 UpdateInstr->eraseFromParent();
3596 }
3597
3598 for (auto &TS : TagStores)
3599 TS.MI->eraseFromParent();
3600 }
3601
3602 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3603 int64_t &Size, bool &ZeroData) {
3604 MachineFunction &MF = *MI.getParent()->getParent();
3605 const MachineFrameInfo &MFI = MF.getFrameInfo();
3606
3607 unsigned Opcode = MI.getOpcode();
3608 ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3609 Opcode == AArch64::STZ2GOffset);
3610
3611 if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3612 if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3613 return false;
3614 if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3615 return false;
3616 Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3617 Size = MI.getOperand(2).getImm();
3618 return true;
3619 }
3620
3621 if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3622 Size = 16;
3623 else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3624 Size = 32;
3625 else
3626 return false;
3627
3628 if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3629 return false;
3630
3631 Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3632 16 * MI.getOperand(2).getImm();
3633 return true;
3634 }
3635
3636 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3637 // and replace them with a shorter instruction sequence:
3638 // * replace STG + STG with ST2G
3639 // * replace STGloop + STGloop with STGloop
3640 // This code needs to run when stack slot offsets are already known, but before
3641 // FrameIndex operands in STG instructions are eliminated.
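// Illustrative example (hypothetical frame indices, before FI elimination):
//   STGOffset $sp, %stack.0, 0     ; tag the 16 bytes of %stack.0
//   STGOffset $sp, %stack.1, 0     ; tag the 16 bytes of the adjacent %stack.1
// can be rewritten as one 32-byte tag store (an ST2G) once the two slots are
// known to be adjacent in the final layout.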
3642 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3643 const AArch64FrameLowering *TFI,
3644 RegScavenger *RS) {
3645 bool FirstZeroData;
3646 int64_t Size, Offset;
3647 MachineInstr &MI = *II;
3648 MachineBasicBlock *MBB = MI.getParent();
3649 MachineBasicBlock::iterator NextI = ++II;
3650 if (&MI == &MBB->instr_back())
3651 return II;
3652 if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3653 return II;
3654
3655 SmallVector<TagStoreInstr, 4> Instrs;
3656 Instrs.emplace_back(&MI, Offset, Size);
3657
3658 constexpr int kScanLimit = 10;
3659 int Count = 0;
3660 for (MachineBasicBlock::iterator E = MBB->end();
3661 NextI != E && Count < kScanLimit; ++NextI) {
3662 MachineInstr &MI = *NextI;
3663 bool ZeroData;
3664 int64_t Size, Offset;
3665 // Collect instructions that update memory tags with a FrameIndex operand
3666 // and (when applicable) constant size, and whose output registers are dead
3667 // (the latter is almost always the case in practice). Since these
3668 // instructions effectively have no inputs or outputs, we are free to skip
3669 // any non-aliasing instructions in between without tracking used registers.
3670 if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3671 if (ZeroData != FirstZeroData)
3672 break;
3673 Instrs.emplace_back(&MI, Offset, Size);
3674 continue;
3675 }
3676
3677 // Only count non-transient, non-tagging instructions toward the scan
3678 // limit.
3679 if (!MI.isTransient())
3680 ++Count;
3681
3682 // Just in case, stop before the epilogue code starts.
3683 if (MI.getFlag(MachineInstr::FrameSetup) ||
3684 MI.getFlag(MachineInstr::FrameDestroy))
3685 break;
3686
3687 // Reject anything that may alias the collected instructions.
3688 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3689 break;
3690 }
3691
3692 // New code will be inserted after the last tagging instruction we've found.
3693 MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3694 InsertI++;
3695
3696 llvm::stable_sort(Instrs,
3697 [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3698 return Left.Offset < Right.Offset;
3699 });
3700
3701 // Make sure that we don't have any overlapping stores.
3702 int64_t CurOffset = Instrs[0].Offset;
3703 for (auto &Instr : Instrs) {
3704 if (CurOffset > Instr.Offset)
3705 return NextI;
3706 CurOffset = Instr.Offset + Instr.Size;
3707 }
3708
3709 // Find contiguous runs of tagged memory and emit shorter instruction
3710   // sequences for them when possible.
3711 TagStoreEdit TSE(MBB, FirstZeroData);
3712 Optional<int64_t> EndOffset;
3713 for (auto &Instr : Instrs) {
3714 if (EndOffset && *EndOffset != Instr.Offset) {
3715 // Found a gap.
3716 TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
3717 TSE.clear();
3718 }
3719
3720 TSE.addInstruction(Instr);
3721 EndOffset = Instr.Offset + Instr.Size;
3722 }
3723
3724 // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
3725 TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */
3726 !MBB->getParent()
3727 ->getInfo<AArch64FunctionInfo>()
3728 ->needsAsyncDwarfUnwindInfo());
3729
3730 return InsertI;
3731 }
3732 } // namespace
3733
3734 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3735 MachineFunction &MF, RegScavenger *RS = nullptr) const {
3736 if (StackTaggingMergeSetTag)
3737 for (auto &BB : MF)
3738 for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3739 II = tryMergeAdjacentSTG(II, this, RS);
3740 }
3741
3742 /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3743 /// before the update. This is easily retrieved as it is exactly the offset
3744 /// that is set in processFunctionBeforeFrameFinalized.
3745 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3746 const MachineFunction &MF, int FI, Register &FrameReg,
3747 bool IgnoreSPUpdates) const {
3748 const MachineFrameInfo &MFI = MF.getFrameInfo();
3749 if (IgnoreSPUpdates) {
3750 LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3751 << MFI.getObjectOffset(FI) << "\n");
3752 FrameReg = AArch64::SP;
3753 return StackOffset::getFixed(MFI.getObjectOffset(FI));
3754 }
3755
3756 // Go to common code if we cannot provide sp + offset.
3757 if (MFI.hasVarSizedObjects() ||
3758 MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
3759 MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
3760 return getFrameIndexReference(MF, FI, FrameReg);
3761
3762 FrameReg = AArch64::SP;
3763 return getStackOffset(MF, MFI.getObjectOffset(FI));
3764 }
3765
3766 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3767 /// the parent's frame pointer.
3768 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3769 const MachineFunction &MF) const {
3770 return 0;
3771 }
3772
3773 /// Funclets only need to account for space for the callee saved registers,
3774 /// as the locals are accounted for in the parent's stack frame.
3775 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3776 const MachineFunction &MF) const {
3777 // This is the size of the pushed CSRs.
3778 unsigned CSSize =
3779 MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3780 // This is the amount of stack a funclet needs to allocate.
3781 return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3782 getStackAlign());
3783 }
3784
3785 namespace {
3786 struct FrameObject {
3787 bool IsValid = false;
3788 // Index of the object in MFI.
3789 int ObjectIndex = 0;
3790 // Group ID this object belongs to.
3791 int GroupIndex = -1;
3792 // This object should be placed first (closest to SP).
3793 bool ObjectFirst = false;
3794 // This object's group (which always contains the object with
3795 // ObjectFirst==true) should be placed first.
3796 bool GroupFirst = false;
3797 };
3798
3799 class GroupBuilder {
3800 SmallVector<int, 8> CurrentMembers;
3801 int NextGroupIndex = 0;
3802 std::vector<FrameObject> &Objects;
3803
3804 public:
3805   GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3806   void AddMember(int Index) { CurrentMembers.push_back(Index); }
3807   void EndCurrentGroup() {
3808 if (CurrentMembers.size() > 1) {
3809 // Create a new group with the current member list. This might remove them
3810 // from their pre-existing groups. That's OK, dealing with overlapping
3811 // groups is too hard and unlikely to make a difference.
3812 LLVM_DEBUG(dbgs() << "group:");
3813 for (int Index : CurrentMembers) {
3814 Objects[Index].GroupIndex = NextGroupIndex;
3815 LLVM_DEBUG(dbgs() << " " << Index);
3816 }
3817 LLVM_DEBUG(dbgs() << "\n");
3818 NextGroupIndex++;
3819 }
3820 CurrentMembers.clear();
3821 }
3822 };
3823
3824 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3825 // Objects at a lower index are closer to FP; objects at a higher index are
3826 // closer to SP.
3827 //
3828 // For consistency in our comparison, all invalid objects are placed
3829 // at the end. This also allows us to stop walking when we hit the
3830 // first invalid item after it's all sorted.
3831 //
3832 // The "first" object goes first (closest to SP), followed by the members of
3833 // the "first" group.
3834 //
3835 // The rest are sorted by the group index to keep the groups together.
3836 // Higher numbered groups are more likely to be around longer (i.e. untagged
3837 // in the function epilogue and not at some earlier point). Place them closer
3838 // to SP.
3839 //
3840 // If all else equal, sort by the object index to keep the objects in the
3841 // original order.
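  // Illustrative example (hypothetical objects): {#0 in group 1, #1 in group 0,
  // #2 invalid} sorts as #1, #0, #2: lower-numbered groups land closer to FP,
  // higher-numbered groups closer to SP, and invalid entries go last.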
3842 return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3843 A.ObjectIndex) <
3844 std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3845 B.ObjectIndex);
3846 }
3847 } // namespace
3848
3849 void AArch64FrameLowering::orderFrameObjects(
3850 const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3851 if (!OrderFrameObjects || ObjectsToAllocate.empty())
3852 return;
3853
3854 const MachineFrameInfo &MFI = MF.getFrameInfo();
3855 std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3856 for (auto &Obj : ObjectsToAllocate) {
3857 FrameObjects[Obj].IsValid = true;
3858 FrameObjects[Obj].ObjectIndex = Obj;
3859 }
3860
3861 // Identify stack slots that are tagged at the same time.
3862 GroupBuilder GB(FrameObjects);
3863 for (auto &MBB : MF) {
3864 for (auto &MI : MBB) {
3865 if (MI.isDebugInstr())
3866 continue;
3867 int OpIndex;
3868 switch (MI.getOpcode()) {
3869 case AArch64::STGloop:
3870 case AArch64::STZGloop:
3871 OpIndex = 3;
3872 break;
3873 case AArch64::STGOffset:
3874 case AArch64::STZGOffset:
3875 case AArch64::ST2GOffset:
3876 case AArch64::STZ2GOffset:
3877 OpIndex = 1;
3878 break;
3879 default:
3880 OpIndex = -1;
3881 }
3882
3883 int TaggedFI = -1;
3884 if (OpIndex >= 0) {
3885 const MachineOperand &MO = MI.getOperand(OpIndex);
3886 if (MO.isFI()) {
3887 int FI = MO.getIndex();
3888 if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3889 FrameObjects[FI].IsValid)
3890 TaggedFI = FI;
3891 }
3892 }
3893
3894 // If this is a stack tagging instruction for a slot that is not part of a
3895 // group yet, either start a new group or add it to the current one.
3896 if (TaggedFI >= 0)
3897 GB.AddMember(TaggedFI);
3898 else
3899 GB.EndCurrentGroup();
3900 }
3901 // Groups should never span multiple basic blocks.
3902 GB.EndCurrentGroup();
3903 }
3904
3905 // If the function's tagged base pointer is pinned to a stack slot, we want to
3906 // put that slot first when possible. This will likely place it at SP + 0,
3907 // and save one instruction when generating the base pointer because IRG does
3908 // not allow an immediate offset.
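  // Illustrative example (hypothetical layout): with the pinned slot at SP + 0
  // the tagged base pointer can be formed directly as "irg x0, sp" instead of
  // "add x0, sp, #offset" followed by "irg x0, x0".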
3909 const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3910 Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3911 if (TBPI) {
3912 FrameObjects[*TBPI].ObjectFirst = true;
3913 FrameObjects[*TBPI].GroupFirst = true;
3914 int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3915 if (FirstGroupIndex >= 0)
3916 for (FrameObject &Object : FrameObjects)
3917 if (Object.GroupIndex == FirstGroupIndex)
3918 Object.GroupFirst = true;
3919 }
3920
3921 llvm::stable_sort(FrameObjects, FrameObjectCompare);
3922
3923 int i = 0;
3924 for (auto &Obj : FrameObjects) {
3925 // All invalid items are sorted at the end, so it's safe to stop.
3926 if (!Obj.IsValid)
3927 break;
3928 ObjectsToAllocate[i++] = Obj.ObjectIndex;
3929 }
3930
3931 LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3932 : FrameObjects) {
3933 if (!Obj.IsValid)
3934 break;
3935 dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3936 if (Obj.ObjectFirst)
3937 dbgs() << ", first";
3938 if (Obj.GroupFirst)
3939 dbgs() << ", group-first";
3940 dbgs() << "\n";
3941 });
3942 }
3943