//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetFrameLowering
// class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that a particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the
// main function body, after the prologue has run. However, it's depicted here
// for completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// |        SVE stack objects          |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// |                                   |
// | local variables of fixed size     |
// | including spill slots             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access the data in a frame at compile time, a constant offset must be
// computable from one of the pointers (fp, bp, sp). The size of the areas
// with a dotted background cannot be computed at compile-time if they are
// present, so all three of fp, bp and sp need to be set up to be able to
// access all contents in the frame areas, assuming all of the frame areas
// are non-empty.
//
// For most functions, some of the frame areas are empty.
// For those functions, it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the frame pointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//     ldr z8, [fp, #-7 mul vl]
// For SVE the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the frame pointer before
// accessing the SVE object in the frame.
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
// Outgoing function arguments must be at the bottom of the stack frame when
// calling another function. If we do not have variable-sized stack objects, we
// can allocate a "reserved call frame" area at the bottom of the local
// variable area, large enough for all outgoing calls. If we do have VLAs, then
// the stack pointer must be decremented and incremented around each call to
// make space for the arguments below the VLAs. Both cases are sketched below.
//
// FIXME: also explain the redzone concept.
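//
// As an illustrative sketch (not emitted verbatim; sizes, registers and the
// call frame layout depend on the function), a reserved call frame folds the
// outgoing argument space into the initial allocation:
//
//     sub sp, sp, #96      // locals + largest outgoing call frame
//     ...
//     bl  callee           // outgoing arguments already fit below sp
//     ...
//     add sp, sp, #96
//
// whereas with VLAs each call site brackets its own adjustment (lowered from
// the ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudos):
//
//     sub sp, sp, #32      // make space for this call's arguments
//     bl  callee
//     add sp, sp, #32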
//
//===----------------------------------------------------------------------===//

#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                   cl::desc("enable use of redzone on AArch64"),
                                   cl::init(false), cl::Hidden);

static cl::opt<bool>
    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
                         cl::desc("reverse the CSR restore sequence"),
                         cl::init(false), cl::Hidden);

static cl::opt<bool> StackTaggingMergeSetTag(
    "stack-tagging-merge-settag",
    cl::desc("merge settag instruction in function epilog"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                       cl::desc("sort stack allocations"),
                                       cl::init(true), cl::Hidden);

cl::opt<bool> EnableHomogeneousPrologEpilog(
    "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
    cl::desc("Emit homogeneous prologue and epilogue for the size "
             "optimization (default = off)"));

STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
/// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
/// tail call to a function that uses less stack space for arguments) or
/// negative (for a tail call to a function that needs more stack space than us
/// for arguments).
static int64_t getArgumentStackToRestore(MachineFunction &MF,
                                         MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  bool IsTailCallReturn = false;
  if (MBB.end() != MBBI) {
    unsigned RetOpcode = MBBI->getOpcode();
    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
                       RetOpcode == AArch64::TCRETURNri ||
                       RetOpcode == AArch64::TCRETURNriBTI;
  }
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  int64_t ArgumentPopSize = 0;
  if (IsTailCallReturn) {
    MachineOperand &StackAdjust = MBBI->getOperand(1);

    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments. This is
    // calculated during LowerCall and consumed here...
    ArgumentPopSize = StackAdjust.getImm();
  } else {
    // ... otherwise the amount to pop is *all* of the argument space,
    // conveniently stored in the MachineFunctionInfo by
    // LowerFormalArguments. This will, of course, be zero for the C calling
    // convention.
    ArgumentPopSize = AFI->getArgumentStackToRestore();
  }

  return ArgumentPopSize;
}

static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);

/// Returns true if homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When an Exit block is given, this check is for the epilog.
bool AArch64FrameLowering::homogeneousPrologEpilog(
    MachineFunction &MF, MachineBasicBlock *Exit) const {
  if (!MF.getFunction().hasMinSize())
    return false;
  if (!EnableHomogeneousPrologEpilog)
    return false;
  if (ReverseCSRRestoreSeq)
    return false;
  if (EnableRedZone)
    return false;

  // TODO: Windows is not supported yet.
  if (needsWinCFI(MF))
    return false;
  // TODO: SVE is not supported yet.
  if (getSVEStackSize(MF))
    return false;

  // Bail on stack adjustment needed on return for simplicity.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
    return false;
  if (Exit && getArgumentStackToRestore(MF, *Exit))
    return false;

  return true;
}

/// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
}

/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(),
/// isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;

/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
  // range.
  // We'll end up allocating an unnecessary spill slot a lot, but
  // realistically that's not a big deal at this stage of the game.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.isDebugInstr() || MI.isPseudo() ||
          MI.getOpcode() == AArch64::ADDXri ||
          MI.getOpcode() == AArch64::ADDSXri)
        continue;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isFI())
          continue;

        StackOffset Offset;
        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
            AArch64FrameOffsetCannotUpdate)
          return 0;
      }
    }
  }
  return DefaultSafeSPDisplacement;
}

TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
  return TargetStackID::ScalableVector;
}

/// Returns the size of the fixed object area (allocated next to sp on entry).
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
static unsigned getFixedObjectSize(const MachineFunction &MF,
                                   const AArch64FunctionInfo *AFI, bool IsWin64,
                                   bool IsFunclet) {
  if (!IsWin64 || IsFunclet) {
    return AFI->getTailCallReservedStack();
  } else {
    if (AFI->getTailCallReservedStack() != 0)
      report_fatal_error("cannot generate ABI-changing tail call for Win64");
    // Var args are stored here in the primary function.
    const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
    // To support EH funclets we allocate an UnwindHelp object.
    const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
    return alignTo(VarArgsArea + UnwindHelpObject, 16);
  }
}

/// Returns the size of the entire SVE stackframe (callee saves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}

bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
  if (!EnableRedZone)
    return false;

  // Don't use the red zone if the function explicitly asks us not to.
  // This is typically used for kernel code.
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
  if (!RedZoneSize)
    return false;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  uint64_t NumBytes = AFI->getLocalStackSize();

  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
           getSVEStackSize(MF));
}

/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  // Win64 EH requires a frame pointer if funclets are present, as the locals
  // are accessed off the frame pointer in both the parent function and the
  // funclets.
  if (MF.hasEHFunclets())
    return true;
  // Retain behavior of always omitting the FP for leaf functions when
  // possible.
  if (MF.getTarget().Options.DisableFramePointerElim(MF))
    return true;
  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
      MFI.hasStackMap() || MFI.hasPatchPoint() ||
      RegInfo->hasStackRealignment(MF))
    return true;
  // With large call frames around we may need to use FP to access the
  // scavenging emergency spill slot.
  //
  // Unfortunately some calls to hasFP() (e.g. machine verifier ->
  // getReservedReg() -> hasFP) happen in the middle of GlobalISel and are too
  // early to know the max call frame size. Hopefully conservatively returning
  // "true" in those cases is fine.
  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
  if (!MFI.isMaxCallFrameSizeComputed() ||
      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
    return true;

  return false;
}

/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  return !MF.getFrameInfo().hasVarSizedObjects();
}

MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  const AArch64InstrInfo *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  DebugLoc DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    int64_t Amount = I->getOperand(0).getImm();
    Amount = alignTo(Amount, getStackAlign());
    if (!IsDestroy)
      Amount = -Amount;

    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
    // doesn't have to pop anything), then the first operand will be zero too,
    // so this adjustment is a no-op.
    if (CalleePopAmount == 0) {
      // FIXME: in-function stack adjustment for calls is limited to 24-bits
      // because there's no guaranteed temporary register available.
      //
      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For offset <= 12-bit, we use LSL #0
      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
      //    LSL #0, and the other uses LSL #12.
      //
      // Most call frames will be allocated at the start of a function so
      // this is OK, but it is a limitation that needs dealing with.
      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(Amount), TII);
    }
  } else if (CalleePopAmount != 0) {
    // If the calling convention demands that the callee pops arguments from
    // the stack, we want to add it back if we have a reserved call frame.
    assert(CalleePopAmount < 0xffffff && "call frame too large");
    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
  }
  return MBB.erase(I);
}

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
                                     int NumBytes, int NumVGScaledBytes,
                                     unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}

// Creates an MCCFIInstruction:
//   { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
    const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
                                                        NumVGScaledBytes);

  std::string CommentBuffer = "sp";
  llvm::raw_string_ostream Comment(CommentBuffer);

  // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> Expr;
  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
  Expr.push_back(0);
  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  uint8_t buffer[16];
  DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
                                        Comment.str());
}

MCCFIInstruction AArch64FrameLowering::createCfaOffset(
    const TargetRegisterInfo &TRI, unsigned Reg,
    const StackOffset &OffsetFromDefCFA) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
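  // (For example, a GPR saved at CFA-16 is simply described as
  // `.cfi_offset w19, -16`; the DWARF expression built below is only needed
  // for VL-scaled locations such as "z8 @ cfa - 16 - 8 * VG". The offsets
  // shown are illustrative.)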
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> OffsetExpr;
  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  uint8_t buffer[16];
  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
}

void AArch64FrameLowering::emitCalleeSavedFrameMoves(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetInstrInfo *TII = STI.getInstrInfo();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  // Add callee saved registers to move list.
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  for (const auto &Info : CSI) {
    unsigned Reg = Info.getReg();

    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
    unsigned NewReg;
    if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg,
                                                                   NewReg))
      Reg = NewReg;
    else
      continue;

    StackOffset Offset;
    if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
      AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
      Offset =
          StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
          StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
    } else {
      Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
                                     getOffsetOfLocalArea());
    }
    unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
  MachineFunction *MF = MBB->getParent();

  // If MBB is an entry block, use X9 as the scratch register
  if (&MF->front() == MBB)
    return AArch64::X9;

  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  LiveRegs.addLiveIns(*MBB);

  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // Prefer X9 since it was historically used for the prologue scratch reg.
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  if (LiveRegs.available(MRI, AArch64::X9))
    return AArch64::X9;

  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return AArch64::NoRegister;
}

bool AArch64FrameLowering::canUseAsPrologue(
    const MachineBasicBlock &MBB) const {
  const MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  // Don't need a scratch register if we're not going to re-align the stack.
  if (!RegInfo->hasStackRealignment(*MF))
    return true;
  // Otherwise, we can use any block as long as it has a scratch register
  // available.
  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}

static bool windowsRequiresStackProbe(MachineFunction &MF,
                                      uint64_t StackSizeInBytes) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  if (!Subtarget.isTargetWindows())
    return false;
  const Function &F = MF.getFunction();
  // TODO: When implementing stack protectors, take that into account
  // for the probe threshold.
  unsigned StackProbeSize = 4096;
  if (F.hasFnAttribute("stack-probe-size"))
    F.getFnAttribute("stack-probe-size")
        .getValueAsString()
        .getAsInteger(0, StackProbeSize);
  return (StackSizeInBytes >= StackProbeSize) &&
         !F.hasFnAttribute("no-stack-arg-probe");
}

static bool needsWinCFI(const MachineFunction &MF) {
  const Function &F = MF.getFunction();
  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         F.needsUnwindTableEntry();
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
    MachineFunction &MF, uint64_t StackBumpBytes) const {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (homogeneousPrologEpilog(MF))
    return false;

  if (AFI->getLocalStackSize() == 0)
    return false;

  // For WinCFI, if optimizing for size, prefer to not combine the stack bump
  // (to force a stp with predecrement) to match the packed unwind format,
  // provided that there actually are any callee saved registers to merge the
  // decrement with.
  // This is potentially marginally slower, but allows using the packed
  // unwind format for functions that both have a local area and callee saved
  // registers.
  // Using the packed unwind format notably reduces the size of the unwind
  // info.
  if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
      MF.getFunction().hasOptSize())
    return false;

  // 512 is the maximum immediate for stp/ldp that will be used for
  // callee-save save/restores
  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
    return false;

  if (MFI.hasVarSizedObjects())
    return false;

  if (RegInfo->hasStackRealignment(MF))
    return false;

  // This isn't strictly necessary, but it simplifies things a bit since the
  // current RedZone handling code assumes the SP is adjusted by the
  // callee-save save/restore code.
  if (canUseRedZone(MF))
    return false;

  // When there is an SVE area on the stack, always allocate the
  // callee-saves and spills/locals separately.
  if (getSVEStackSize(MF))
    return false;

  return true;
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
    return false;

  if (MBB.empty())
    return true;

  // Disable combined SP bump if the last instruction is an MTE tag store. It
  // is almost always better to merge SP adjustment into those instructions.
  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastI != Begin) {
    --LastI;
    if (LastI->isTransient())
      continue;
    if (!LastI->getFlag(MachineInstr::FrameDestroy))
      break;
  }
  switch (LastI->getOpcode()) {
  case AArch64::STGloop:
  case AArch64::STZGloop:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return false;
  default:
    return true;
  }
  llvm_unreachable("unreachable");
}

// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
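// For example (illustrative only), the prologue save
//     stp x19, x20, [sp, #-32]!
// is annotated with a SEH_SaveRegP_X pseudo that prints as
//     .seh_save_regp_x x19, 32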
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
                                             const TargetInstrInfo &TII,
                                             MachineInstr::MIFlag Flag) {
  unsigned Opc = MBBI->getOpcode();
  MachineBasicBlock *MBB = MBBI->getParent();
  MachineFunction &MF = *MBB->getParent();
  DebugLoc DL = MBBI->getDebugLoc();
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  int Imm = MBBI->getOperand(ImmIdx).getImm();
  MachineInstrBuilder MIB;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  switch (Opc) {
  default:
    llvm_unreachable("No SEH Opcode for this instruction");
  case AArch64::LDPDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPDpre: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPXpre: {
    Register Reg0 = MBBI->getOperand(1).getReg();
    Register Reg1 = MBBI->getOperand(2).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRDpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRXpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPDi:
  case AArch64::LDPDi: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPXi:
  case AArch64::LDPXi: {
    Register Reg0 = MBBI->getOperand(0).getReg();
    Register Reg1 = MBBI->getOperand(1).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::STRXui:
  case AArch64::LDRXui: {
    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STRDui:
  case AArch64::LDRDui: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  }
  auto I = MBB->insertAfter(MBBI, MIB);
  return I;
}

// Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
                           unsigned LocalStackSize) {
  MachineOperand *ImmOpnd = nullptr;
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Fix the offset in the SEH instruction");
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFReg:
    ImmOpnd = &MBBI->getOperand(ImmIdx);
    break;
  }
  if (ImmOpnd)
    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}

// Convert callee-save register save/restore instructions to do stack pointer
// decrement/increment, allocating/deallocating the callee-save stack area, by
// converting the store/load to use the pre/post increment version.
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
    bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instructions.
  while (MBBI->getOpcode() == AArch64::STRXpost ||
         MBBI->getOpcode() == AArch64::LDRXpre ||
         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
    ++MBBI;
  }
  unsigned NewOpc;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    break;
  case AArch64::STPQi:
    NewOpc = AArch64::STPQpre;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    break;
  case AArch64::STRQui:
    NewOpc = AArch64::STRQpre;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    break;
  case AArch64::LDPQi:
    NewOpc = AArch64::LDPQpost;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    break;
  case AArch64::LDRQui:
    NewOpc = AArch64::LDRQpost;
    break;
  }
  // Get rid of the SEH code associated with the old instruction.
  if (NeedsWinCFI) {
    auto SEH = std::next(MBBI);
    if (AArch64InstrInfo::isSEHInstruction(*SEH))
      SEH->eraseFromParent();
  }

  TypeSize Scale = TypeSize::Fixed(1);
  unsigned Width;
  int64_t MinOffset, MaxOffset;
  bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
      NewOpc, Scale, Width, MinOffset, MaxOffset);
  (void)Success;
  assert(Success && "unknown load/store opcode");

  // If the first store isn't right where we want SP then we can't fold the
  // update in so create a normal arithmetic instruction instead.
  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
      CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(CSStackSizeInc), TII,
                    InProlog ? MachineInstr::FrameSetup
                             : MachineInstr::FrameDestroy);
    return std::prev(MBBI);
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  assert(CSStackSizeInc % Scale == 0);
  MIB.addImm(CSStackSizeInc / (int)Scale);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands());

  // Generate a new SEH code that corresponds to the new instruction.
  if (NeedsWinCFI) {
    *HasWinCFI = true;
    InsertSEH(*MIB, *TII,
              InProlog ? MachineInstr::FrameSetup
                       : MachineInstr::FrameDestroy);
  }

  return std::prev(MBB.erase(MBBI));
}

// Fix up callee-save register save/restore instructions to take into account
// the combined SP bump by adding the local stack size to the stack offsets.
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
                                              uint64_t LocalStackSize,
                                              bool NeedsWinCFI,
                                              bool *HasWinCFI) {
  if (AArch64InstrInfo::isSEHInstruction(MI))
    return;

  unsigned Opc = MI.getOpcode();

  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instructions.
  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
      Opc == AArch64::CFI_INSTRUCTION) {
    if (Opc != AArch64::CFI_INSTRUCTION)
      assert(MI.getOperand(0).getReg() != AArch64::SP);
    return;
  }

  unsigned Scale;
  switch (Opc) {
  case AArch64::STPXi:
  case AArch64::STRXui:
  case AArch64::STPDi:
  case AArch64::STRDui:
  case AArch64::LDPXi:
  case AArch64::LDRXui:
  case AArch64::LDPDi:
  case AArch64::LDRDui:
    Scale = 8;
    break;
  case AArch64::STPQi:
  case AArch64::STRQui:
  case AArch64::LDPQi:
  case AArch64::LDRQui:
    Scale = 16;
    break;
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  }

  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
  // All generated opcodes have scaled offsets.
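  // (For example, folding a 64-byte local area into the SP bump rewrites
  //     stp x29, x30, [sp]    into    stp x29, x30, [sp, #64]
  // i.e. the scaled immediate operand goes from 0 to 64 / Scale == 8.
  // Illustrative only.)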
  assert(LocalStackSize % Scale == 0);
  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);

  if (NeedsWinCFI) {
    *HasWinCFI = true;
    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
           "Expecting a SEH instruction");
    fixupSEHOpcode(MBBI, LocalStackSize);
  }
}

static void adaptForLdStOpt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator FirstSPPopI,
                            MachineBasicBlock::iterator LastPopI) {
  // Sometimes (when we restore in the same order as we save), we can end up
  // with code like this:
  //
  //    ldp      x26, x25, [sp]
  //    ldp      x24, x23, [sp, #16]
  //    ldp      x22, x21, [sp, #32]
  //    ldp      x20, x19, [sp, #48]
  //    add      sp, sp, #64
  //
  // In this case, it is always better to put the first ldp at the end, so
  // that the load-store optimizer can run and merge the ldp and the add into
  // a post-index ldp.
  // If we managed to grab the first pop instruction, move it to the end.
  if (ReverseCSRRestoreSeq)
    MBB.splice(FirstSPPopI, &MBB, LastPopI);
  // We should end up with something like this now:
  //
  //    ldp      x24, x23, [sp, #16]
  //    ldp      x22, x21, [sp, #32]
  //    ldp      x20, x19, [sp, #48]
  //    ldp      x26, x25, [sp]
  //    add      sp, sp, #64
  //
  // and the load-store optimizer can merge the last two instructions into:
  //
  //    ldp      x26, x25, [sp], #64
  //
}

static bool isTargetWindows(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}

// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::STR_ZXI:
  case AArch64::STR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_PXI:
    return I->getFlag(MachineInstr::FrameSetup) ||
           I->getFlag(MachineInstr::FrameDestroy);
  }
}

void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool needsFrameMoves =
      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool HasFP = hasFP(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

  bool IsFunclet = MBB.isEHFuncletEntry();

  // At this point, we're going to decide whether or not the function uses a
  // redzone. In most cases, the function doesn't have a redzone so let's
  // assume that's false and set it to true in the case that there's a redzone.
  AFI->setHasRedZone(false);

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;

  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
  if (MFnI.shouldSignReturnAddress()) {
    if (MFnI.shouldSignWithBKey()) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
          .setMIFlag(MachineInstr::FrameSetup);
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }

  // We signal the presence of a Swift extended frame to external tools by
  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
  // simple ORR is sufficient, and it is assumed a Swift kernel would
  // initialize the TBI bits so that this remains true.
  if (HasFP && AFI->hasSwiftAsyncContext()) {
    // ORR x29, x29, #0x1000_0000_0000_0000
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
        .addUse(AArch64::FP)
        .addImm(0x1100)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // Set tagged base pointer to the requested stack slot.
  // Ideally it should match the SP value after the prologue.
  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
  if (TBPI)
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // getStackSize() includes all the locals in its size calculation. We don't
  // include these locals when computing the stack size of a funclet, as they
  // are allocated in the parent's stack frame and accessed via the frame
  // pointer from the funclet. We only save the callee saved registers in the
  // funclet, which are really the callee saved registers of the parent
  // function, including the funclet.
  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                               : MFI.getStackSize();
  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
    assert(!HasFP && "unexpected function without stack frame but with FP");
    assert(!SVEStackSize &&
           "unexpected function without stack frame but with SVE objects");
    // All of the stack allocation is for locals.
    AFI->setLocalStackSize(NumBytes);
    if (!NumBytes)
      return;
    // REDZONE: If the stack size is less than 128 bytes, we don't need
    // to actually allocate.
    if (canUseRedZone(MF)) {
      AFI->setHasRedZone(true);
      ++NumRedZoneFunctions;
    } else {
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI,
                      &HasWinCFI);
      if (!NeedsWinCFI && needsFrameMoves) {
        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
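        // (For a hypothetical 16-byte leaf frame this would emit
        // `.cfi_def_cfa_offset 16`.)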
        unsigned CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
      }
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    return;
  }

  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-NumBytes), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    NumBytes = 0;
  } else if (HomPrologEpilog) {
    // Stack has been already adjusted.
    NumBytes -= PrologueSaveSize;
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Move past the saves of the callee-saved registers, fixing up the offsets
  // and pre-inc if we decided to combine the callee-save and local stack
  // pointer bump above.
  MachineBasicBlock::iterator End = MBB.end();
  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
         !IsSVECalleeSave(MBBI)) {
    if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
    ++MBBI;
  }

  // For funclets the FP belongs to the containing function.
  if (!IsFunclet && HasFP) {
    // Only set up FP if we actually need to.
    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();

    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();

    if (AFI->hasSwiftAsyncContext()) {
      // Before we update the live FP we have to ensure there's a valid (or
      // null) asynchronous context in its slot just before FP in the frame
      // record, so store it now.
      const auto &Attrs = MF.getFunction().getAttributes();
      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
      if (HaveInitialContext)
        MBB.addLiveIn(AArch64::X22);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
          .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
          .addUse(AArch64::SP)
          .addImm(FPOffset - 8)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    if (HomPrologEpilog) {
      auto Prolog = MBBI;
      --Prolog;
      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
    } else {
      // Issue    sub fp, sp, FPOffset    or
      //          mov fp, sp    when FPOffset is zero.
      // Note: All stores of callee-saved registers are marked as "FrameSetup".
      // This code marks the instruction(s) that set the FP also.
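      // (Illustrative: with the frame record at the bottom of the GPR
      // callee-save area, the typical non-Darwin sequence is
      //     stp x29, x30, [sp, #-16]!   // FrameSetup
      //     mov x29, sp                 // FrameSetup; FPOffset == 0
      // exact offsets depend on the callee-save layout.)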
      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
                      StackOffset::getFixed(FPOffset), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI,
                      &HasWinCFI);
    }
  }

  if (windowsRequiresStackProbe(MF, NumBytes)) {
    uint64_t NumWords = NumBytes >> 4;
    if (NeedsWinCFI) {
      HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount. We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");

      uint32_t LowNumWords = NumWords & 0xFFFF;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
          .addImm(LowNumWords)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
      if ((NumWords & 0xFFFF0000) != 0) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
            .addReg(AArch64::X15)
            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    switch (MF.getTarget().getCodeModel()) {
    case CodeModel::Tiny:
    case CodeModel::Small:
    case CodeModel::Medium:
    case CodeModel::Kernel:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
          .addExternalSymbol("__chkstk")
          .addReg(AArch64::X15, RegState::Implicit)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    case CodeModel::Large:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
          .addReg(AArch64::X16, RegState::Define)
          .addExternalSymbol("__chkstk")
          .addExternalSymbol("__chkstk")
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }

      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
          .addReg(AArch64::X16, RegState::Kill)
          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    }

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP, RegState::Kill)
        .addReg(AArch64::X15, RegState::Kill)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
        .setMIFlags(MachineInstr::FrameSetup);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
          .addImm(NumBytes)
          .setMIFlag(MachineInstr::FrameSetup);
    }
    NumBytes = 0;
  }

  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;

  // Process the SVE callee-saves to determine what space needs to be
  // allocated.
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    // Find callee save instructions in frame.
    CalleeSavesBegin = MBBI;
    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
      ++MBBI;
    CalleeSavesEnd = MBBI;

    AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
    AllocateAfter = SVEStackSize - AllocateBefore;
  }

  // Allocate space for the callee saves (if any).
  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
                  -AllocateBefore, TII, MachineInstr::FrameSetup);

  // Finally allocate remaining SVE stack space.
  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
                  -AllocateAfter, TII, MachineInstr::FrameSetup);

  // Allocate space for the rest of the frame.
  if (NumBytes) {
    // Alignment is required for the parent frame, not the funclet
    const bool NeedsRealignment =
        !IsFunclet && RegInfo->hasStackRealignment(MF);
    unsigned scratchSPReg = AArch64::SP;

    if (NeedsRealignment) {
      scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
      assert(scratchSPReg != AArch64::NoRegister);
    }

    // If we're a leaf function, try using the red zone.
    if (!canUseRedZone(MF))
      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
      // the correct value here, as NumBytes also includes padding bytes,
      // which shouldn't be counted here.
      emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI,
                      &HasWinCFI);

    if (NeedsRealignment) {
      const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
      assert(NrBitsToZero > 1);
      assert(scratchSPReg != AArch64::SP);

      // SUB X9, SP, NumBytes
      //   -- X9 is a temporary register, so it shouldn't contain any live
      //      data here and is free to use. This is already produced by
      //      emitFrameOffset above.
      // AND SP, X9, 0b11111...0000
      // The logical immediates have a non-trivial encoding. The following
      // formula computes the encoded immediate with all ones but
      // NrBitsToZero zero bits as least significant bits.
      uint32_t andMaskEncoded = (1 << 12)                         // = N
                                | ((64 - NrBitsToZero) << 6)      // immr
                                | ((64 - NrBitsToZero - 1) << 0); // imms

      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
          .addReg(scratchSPReg, RegState::Kill)
          .addImm(andMaskEncoded);
      AFI->setStackRealigned(true);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(NumBytes & andMaskEncoded)
            .setMIFlag(MachineInstr::FrameSetup);
      }
    }
  }

  // If we need a base pointer, set it up here.
It's whatever the value of the
  // stack pointer is at this point. Any variable size objects will be allocated
  // after this, so we can still use the base pointer to reference locals.
  //
  // FIXME: Clarify FrameSetup flags here.
  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
  // needed.
  // For funclets the BP belongs to the containing function.
  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                     false);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // The very last FrameSetup instruction indicates the end of prologue. Emit a
  // SEH opcode indicating the prologue end.
  if (NeedsWinCFI && HasWinCFI) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // SEH funclets are passed the frame pointer in X1. If the parent
  // function uses the base register, then the base register is used
  // directly, and is not retrieved from X1.
  if (IsFunclet && F.hasPersonalityFn()) {
    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
    if (isAsynchronousEHPersonality(Per)) {
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
          .addReg(AArch64::X1)
          .setMIFlag(MachineInstr::FrameSetup);
      MBB.addLiveIn(AArch64::X1);
    }
  }

  if (needsFrameMoves) {
    // An example of the prologue:
    //
    //     .globl __foo
    //     .align 2
    //  __foo:
    // Ltmp0:
    //     .cfi_startproc
    //     .cfi_personality 155, ___gxx_personality_v0
    // Leh_func_begin:
    //     .cfi_lsda 16, Lexception33
    //
    //     stp  xa, xb, [sp, #-offset]!
    //     ...
    //     stp  x28, x27, [sp, #offset-32]
    //     stp  fp, lr, [sp, #offset-16]
    //     add  fp, sp, #offset - 16
    //     sub  sp, sp, #1360
    //
    // The Stack:
    //       +-------------------------------------------+
    // 10000 | ........ | ........ | ........ | ........ |
    // 10004 | ........ | ........ | ........ | ........ |
    //       +-------------------------------------------+
    // 10008 | ........ | ........ | ........ | ........ |
    // 1000c | ........ | ........ | ........ | ........ |
    //       +===========================================+
    // 10010 |                X28 Register               |
    // 10014 |                X28 Register               |
    //       +-------------------------------------------+
    // 10018 |                X27 Register               |
    // 1001c |                X27 Register               |
    //       +===========================================+
    // 10020 |               Frame Pointer               |
    // 10024 |               Frame Pointer               |
    //       +-------------------------------------------+
    // 10028 |               Link Register               |
    // 1002c |               Link Register               |
    //       +===========================================+
    // 10030 | ........ | ........ | ........ | ........ |
    // 10034 | ........ | ........ | ........ | ........ |
    //       +-------------------------------------------+
    // 10038 | ........ | ........ | ........ | ........ |
    // 1003c | ........ | ........ | ........ | ........ |
    //       +-------------------------------------------+
    //
    //     [sp] = 10030        ::    >>initial value<<
    //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
    //     fp = sp == 10020    ::  mov fp, sp
    //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
    //     sp == 10010         ::    >>final value<<
    //
    // The frame pointer (w29) points to address 10020. If we use an offset of
    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
    // for w27, and -32 for w28:
    //
    // Ltmp1:
    //     .cfi_def_cfa w29, 16
    // Ltmp2:
    //     .cfi_offset w30, -8
    // Ltmp3:
    //     .cfi_offset w29, -16
    // Ltmp4:
    //     .cfi_offset w27, -24
    // Ltmp5:
    //     .cfi_offset w28, -32

    if (HasFP) {
      const int OffsetToFirstCalleeSaveFromFP =
          AFI->getCalleeSaveBaseToFrameRecordOffset() -
          AFI->getCalleeSavedStackSize();
      Register FramePtr = RegInfo->getFrameRegister(MF);

      // Define the current CFA rule to use the provided FP.
      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
          nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    } else {
      unsigned CFIIndex;
      if (SVEStackSize) {
        const TargetSubtargetInfo &STI = MF.getSubtarget();
        const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
        StackOffset TotalSize =
            SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
        CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
      } else {
        // Encode the stack size of the leaf function.
        CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
      }
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    // Now emit the moves for whatever callee saved regs we have (including FP,
    // LR if those are saved).
    emitCalleeSavedFrameMoves(MBB, MBBI);
  }
}

static void InsertReturnAddressAuth(MachineFunction &MF,
                                    MachineBasicBlock &MBB) {
  const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
  if (!MFI.shouldSignReturnAddress())
    return;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  DebugLoc DL;
  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();

  // The AUTIASP instruction assembles to a hint instruction before v8.3a so
  // this instruction can safely be used for any v8a architecture.
  // From v8.3a onwards there are optimised authenticate LR and return
  // instructions, namely RETA{A,B}, that can be used instead.
  if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
      MBBI->getOpcode() == AArch64::RET_ReallyLR) {
    BuildMI(MBB, MBBI, DL,
            TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
        .copyImplicitOps(*MBBI);
    MBB.erase(MBBI);
  } else {
    BuildMI(
        MBB, MBBI, DL,
        TII->get(MFI.shouldSignWithBKey() ?
AArch64::AUTIBSP : AArch64::AUTIASP)) 1638 .setMIFlag(MachineInstr::FrameDestroy); 1639 } 1640 } 1641 1642 static bool isFuncletReturnInstr(const MachineInstr &MI) { 1643 switch (MI.getOpcode()) { 1644 default: 1645 return false; 1646 case AArch64::CATCHRET: 1647 case AArch64::CLEANUPRET: 1648 return true; 1649 } 1650 } 1651 1652 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, 1653 MachineBasicBlock &MBB) const { 1654 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); 1655 MachineFrameInfo &MFI = MF.getFrameInfo(); 1656 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 1657 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 1658 DebugLoc DL; 1659 bool NeedsWinCFI = needsWinCFI(MF); 1660 bool HasWinCFI = false; 1661 bool IsFunclet = false; 1662 auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); }); 1663 1664 if (MBB.end() != MBBI) { 1665 DL = MBBI->getDebugLoc(); 1666 IsFunclet = isFuncletReturnInstr(*MBBI); 1667 } 1668 1669 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF) 1670 : MFI.getStackSize(); 1671 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 1672 1673 // All calls are tail calls in GHC calling conv, and functions have no 1674 // prologue/epilogue. 1675 if (MF.getFunction().getCallingConv() == CallingConv::GHC) 1676 return; 1677 1678 // How much of the stack used by incoming arguments this function is expected 1679 // to restore in this particular epilogue. 1680 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB); 1681 1682 // The stack frame should be like below, 1683 // 1684 // ---------------------- --- 1685 // | | | 1686 // | BytesInStackArgArea| CalleeArgStackSize 1687 // | (NumReusableBytes) | (of tail call) 1688 // | | --- 1689 // | | | 1690 // ---------------------| --- | 1691 // | | | | 1692 // | CalleeSavedReg | | | 1693 // | (CalleeSavedStackSize)| | | 1694 // | | | | 1695 // ---------------------| | NumBytes 1696 // | | StackSize (StackAdjustUp) 1697 // | LocalStackSize | | | 1698 // | (covering callee | | | 1699 // | args) | | | 1700 // | | | | 1701 // ---------------------- --- --- 1702 // 1703 // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize 1704 // = StackSize + ArgumentPopSize 1705 // 1706 // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps 1707 // it as the 2nd argument of AArch64ISD::TC_RETURN. 1708 1709 auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); }); 1710 1711 bool IsWin64 = 1712 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); 1713 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); 1714 1715 int64_t AfterCSRPopSize = ArgumentStackToRestore; 1716 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; 1717 // We cannot rely on the local stack size set in emitPrologue if the function 1718 // has funclets, as funclets have different local stack size requirements, and 1719 // the current value set in emitPrologue may be that of the containing 1720 // function. 
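  // (Illustrative values, not from the original: for a 96-byte funclet frame
  // with PrologueSaveSize == 32, the local stack size recorded below is 64
  // bytes.)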
  if (MF.hasEHFunclets())
    AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  if (homogeneousPrologEpilog(MF, &MBB)) {
    assert(!NeedsWinCFI);
    auto LastPopI = MBB.getFirstTerminator();
    if (LastPopI != MBB.begin()) {
      auto HomogeneousEpilog = std::prev(LastPopI);
      if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
        LastPopI = HomogeneousEpilog;
    }

    // Adjust local stack
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-AFI->getLocalStackSize()), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI);

    // SP has been already adjusted while restoring callee save regs.
    // We have already bailed out of the case that adjusts SP for arguments.
    assert(AfterCSRPopSize == 0);
    return;
  }
  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
  // Assume we can't combine the last pop with the sp restore.

  if (!CombineSPBump && PrologueSaveSize != 0) {
    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
    while (AArch64InstrInfo::isSEHInstruction(*Pop))
      Pop = std::prev(Pop);
    // Converting the last ldp to a post-index ldp is valid only if the last
    // ldp's offset is 0.
    const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
    // If the offset is 0 and the AfterCSR pop is not actually trying to
    // allocate more stack for arguments (in space that an untimely interrupt
    // may clobber), convert it to a post-index ldp.
    if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
      convertCalleeSaveRestoreToSPPrePostIncDec(
          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
    else {
      // If not, make sure to emit an add after the last ldp.
      // We're doing this by transferring the size to be restored from the
      // adjustment *before* the CSR pops to the adjustment *after* the CSR
      // pops.
      AfterCSRPopSize += PrologueSaveSize;
    }
  }

  // Move past the restores of the callee-saved registers.
  // If we plan on combining the sp bump of the local stack size and the callee
  // save stack size, we might need to adjust the CSR save and restore offsets.
  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastPopI != Begin) {
    --LastPopI;
    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
        IsSVECalleeSave(LastPopI)) {
      ++LastPopI;
      break;
    } else if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
  }

  if (MF.hasWinCFI()) {
    // If the prologue didn't contain any SEH opcodes and didn't set the
    // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
    // EpilogStart - to avoid generating CFI for functions that don't need it.
    // (And as we didn't generate any prologue at all, it would be asymmetrical
    // to the epilogue.) By the end of the function, we assert that
    // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
    HasWinCFI = true;
    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
    // We need to reset FP to its untagged state on return. Bit 60 is currently
    // used to show the presence of an extended frame.
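    // Note on the encoding below: ANDXri takes an encoded logical immediate,
    // and 0x10fe decodes as N=1, immr=3, imms=62 -- a run of 63 ones rotated
    // so that only bit 60 is clear, i.e. the mask ~(1ULL << 60).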

    // BIC x29, x29, #0x1000_0000_0000_0000
    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
            AArch64::FP)
        .addUse(AArch64::FP)
        .addImm(0x10fe)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // If there is a single SP update, insert it before the ret and we're done.
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
                    TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
                    &HasWinCFI);
    if (HasWinCFI)
      BuildMI(MBB, MBB.getFirstTerminator(), DL,
              TII->get(AArch64::SEH_EpilogEnd))
          .setMIFlag(MachineInstr::FrameDestroy);
    return;
  }

  NumBytes -= PrologueSaveSize;
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Process the SVE callee-saves to determine what space needs to be
  // deallocated.
  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    RestoreBegin = std::prev(RestoreEnd);
    while (RestoreBegin != MBB.begin() &&
           IsSVECalleeSave(std::prev(RestoreBegin)))
      --RestoreBegin;

    assert(IsSVECalleeSave(RestoreBegin) &&
           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");

    StackOffset CalleeSavedSizeAsOffset =
        StackOffset::getScalable(CalleeSavedSize);
    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
    DeallocateAfter = CalleeSavedSizeAsOffset;
  }

  // Deallocate the SVE area.
  if (SVEStackSize) {
    if (AFI->isStackRealigned()) {
      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
        // Set SP to start of SVE callee-save area from which they can
        // be reloaded. The code below will deallocate the stack space
        // by moving FP -> SP.
        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
                        StackOffset::getScalable(-CalleeSavedSize), TII,
                        MachineInstr::FrameDestroy);
    } else {
      if (AFI->getSVECalleeSavedStackSize()) {
        // Deallocate the non-SVE locals first before we can deallocate (and
        // restore callee saves) from the SVE area.
        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
                        StackOffset::getFixed(NumBytes), TII,
                        MachineInstr::FrameDestroy);
        NumBytes = 0;
      }

      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
                      DeallocateBefore, TII, MachineInstr::FrameDestroy);

      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
                      DeallocateAfter, TII, MachineInstr::FrameDestroy);
    }
  }

  if (!hasFP(MF)) {
    bool RedZone = canUseRedZone(MF);
    // If this was a redzone leaf function, we don't need to restore the
    // stack pointer (but we may need to pop stack args for fastcc).
    if (RedZone && AfterCSRPopSize == 0)
      return;

    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
    int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
    if (NoCalleeSaveRestore)
      StackRestoreBytes += AfterCSRPopSize;

    // If we were able to combine the local stack pop with the argument pop,
    // then we're done.
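    // (Illustrative: with no callee-save restores, NumBytes == 48 and
    // AfterCSRPopSize == 16 fold into a single "add sp, sp, #64" here.)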
1887 bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0; 1888 1889 // If we're done after this, make sure to help the load store optimizer. 1890 if (Done) 1891 adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); 1892 1893 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, 1894 StackOffset::getFixed(StackRestoreBytes), TII, 1895 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); 1896 if (Done) { 1897 if (HasWinCFI) { 1898 BuildMI(MBB, MBB.getFirstTerminator(), DL, 1899 TII->get(AArch64::SEH_EpilogEnd)) 1900 .setMIFlag(MachineInstr::FrameDestroy); 1901 } 1902 return; 1903 } 1904 1905 NumBytes = 0; 1906 } 1907 1908 // Restore the original stack pointer. 1909 // FIXME: Rather than doing the math here, we should instead just use 1910 // non-post-indexed loads for the restores if we aren't actually going to 1911 // be able to save any instructions. 1912 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { 1913 emitFrameOffset( 1914 MBB, LastPopI, DL, AArch64::SP, AArch64::FP, 1915 StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()), 1916 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); 1917 } else if (NumBytes) 1918 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, 1919 StackOffset::getFixed(NumBytes), TII, 1920 MachineInstr::FrameDestroy, false, NeedsWinCFI); 1921 1922 // This must be placed after the callee-save restore code because that code 1923 // assumes the SP is at the same location as it was after the callee-save save 1924 // code in the prologue. 1925 if (AfterCSRPopSize) { 1926 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an " 1927 "interrupt may have clobbered"); 1928 // Find an insertion point for the first ldp so that it goes before the 1929 // shadow call stack epilog instruction. This ensures that the restore of 1930 // lr from x18 is placed after the restore from sp. 1931 auto FirstSPPopI = MBB.getFirstTerminator(); 1932 while (FirstSPPopI != Begin) { 1933 auto Prev = std::prev(FirstSPPopI); 1934 if (Prev->getOpcode() != AArch64::LDRXpre || 1935 Prev->getOperand(0).getReg() == AArch64::SP) 1936 break; 1937 FirstSPPopI = Prev; 1938 } 1939 1940 adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); 1941 1942 emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, 1943 StackOffset::getFixed(AfterCSRPopSize), TII, 1944 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); 1945 } 1946 if (HasWinCFI) 1947 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) 1948 .setMIFlag(MachineInstr::FrameDestroy); 1949 } 1950 1951 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for 1952 /// debug info. It's the same as what we use for resolving the code-gen 1953 /// references for now. FIXME: This can go wrong when references are 1954 /// SP-relative and simple call frames aren't used. 
1955 StackOffset 1956 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, 1957 Register &FrameReg) const { 1958 return resolveFrameIndexReference( 1959 MF, FI, FrameReg, 1960 /*PreferFP=*/ 1961 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), 1962 /*ForSimm=*/false); 1963 } 1964 1965 StackOffset 1966 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF, 1967 int FI) const { 1968 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI)); 1969 } 1970 1971 static StackOffset getFPOffset(const MachineFunction &MF, 1972 int64_t ObjectOffset) { 1973 const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); 1974 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 1975 bool IsWin64 = 1976 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); 1977 unsigned FixedObject = 1978 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); 1979 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo()); 1980 int64_t FPAdjust = 1981 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset(); 1982 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust); 1983 } 1984 1985 static StackOffset getStackOffset(const MachineFunction &MF, 1986 int64_t ObjectOffset) { 1987 const auto &MFI = MF.getFrameInfo(); 1988 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize()); 1989 } 1990 1991 // TODO: This function currently does not work for scalable vectors. 1992 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, 1993 int FI) const { 1994 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( 1995 MF.getSubtarget().getRegisterInfo()); 1996 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); 1997 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP 1998 ? getFPOffset(MF, ObjectOffset).getFixed() 1999 : getStackOffset(MF, ObjectOffset).getFixed(); 2000 } 2001 2002 StackOffset AArch64FrameLowering::resolveFrameIndexReference( 2003 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP, 2004 bool ForSimm) const { 2005 const auto &MFI = MF.getFrameInfo(); 2006 int64_t ObjectOffset = MFI.getObjectOffset(FI); 2007 bool isFixed = MFI.isFixedObjectIndex(FI); 2008 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector; 2009 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, 2010 PreferFP, ForSimm); 2011 } 2012 2013 StackOffset AArch64FrameLowering::resolveFrameOffsetReference( 2014 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, 2015 Register &FrameReg, bool PreferFP, bool ForSimm) const { 2016 const auto &MFI = MF.getFrameInfo(); 2017 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( 2018 MF.getSubtarget().getRegisterInfo()); 2019 const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); 2020 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 2021 2022 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed(); 2023 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); 2024 bool isCSR = 2025 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); 2026 2027 const StackOffset &SVEStackSize = getSVEStackSize(MF); 2028 2029 // Use frame pointer to reference fixed objects. Use it for locals if 2030 // there are VLAs or a dynamically realigned SP (and thus the SP isn't 2031 // reliable as a base). Make sure useFPForScavengingIndex() does the 2032 // right thing for the emergency spill slot. 
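  // Rough summary of the selection below: fixed (argument) objects use the FP
  // whenever we have one; CSR objects must use the FP when the stack is
  // realigned; for everything else the choice between FP and SP/BP comes down
  // to which base yields an offset more likely to fit a signed immediate.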
2033 bool UseFP = false; 2034 if (AFI->hasStackFrame() && !isSVE) { 2035 // We shouldn't prefer using the FP when there is an SVE area 2036 // in between the FP and the non-SVE locals/spills. 2037 PreferFP &= !SVEStackSize; 2038 2039 // Note: Keeping the following as multiple 'if' statements rather than 2040 // merging to a single expression for readability. 2041 // 2042 // Argument access should always use the FP. 2043 if (isFixed) { 2044 UseFP = hasFP(MF); 2045 } else if (isCSR && RegInfo->hasStackRealignment(MF)) { 2046 // References to the CSR area must use FP if we're re-aligning the stack 2047 // since the dynamically-sized alignment padding is between the SP/BP and 2048 // the CSR area. 2049 assert(hasFP(MF) && "Re-aligned stack must have frame pointer"); 2050 UseFP = true; 2051 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) { 2052 // If the FPOffset is negative and we're producing a signed immediate, we 2053 // have to keep in mind that the available offset range for negative 2054 // offsets is smaller than for positive ones. If an offset is available 2055 // via the FP and the SP, use whichever is closest. 2056 bool FPOffsetFits = !ForSimm || FPOffset >= -256; 2057 PreferFP |= Offset > -FPOffset; 2058 2059 if (MFI.hasVarSizedObjects()) { 2060 // If we have variable sized objects, we can use either FP or BP, as the 2061 // SP offset is unknown. We can use the base pointer if we have one and 2062 // FP is not preferred. If not, we're stuck with using FP. 2063 bool CanUseBP = RegInfo->hasBasePointer(MF); 2064 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. 2065 UseFP = PreferFP; 2066 else if (!CanUseBP) // Can't use BP. Forced to use FP. 2067 UseFP = true; 2068 // else we can use BP and FP, but the offset from FP won't fit. 2069 // That will make us scavenge registers which we can probably avoid by 2070 // using BP. If it won't fit for BP either, we'll scavenge anyway. 2071 } else if (FPOffset >= 0) { 2072 // Use SP or FP, whichever gives us the best chance of the offset 2073 // being in range for direct access. If the FPOffset is positive, 2074 // that'll always be best, as the SP will be even further away. 2075 UseFP = true; 2076 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) { 2077 // Funclets access the locals contained in the parent's stack frame 2078 // via the frame pointer, so we have to use the FP in the parent 2079 // function. 2080 (void) Subtarget; 2081 assert( 2082 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) && 2083 "Funclets should only be present on Win64"); 2084 UseFP = true; 2085 } else { 2086 // We have the choice between FP and (SP or BP). 2087 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it. 2088 UseFP = true; 2089 } 2090 } 2091 } 2092 2093 assert( 2094 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) && 2095 "In the presence of dynamic stack pointer realignment, " 2096 "non-argument/CSR objects cannot be accessed through the frame pointer"); 2097 2098 if (isSVE) { 2099 StackOffset FPOffset = 2100 StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); 2101 StackOffset SPOffset = 2102 SVEStackSize + 2103 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), 2104 ObjectOffset); 2105 // Always use the FP for SVE spills if available and beneficial. 
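    // In particular, if the SP-relative offset has a non-zero fixed part
    // (non-SVE locals between SP and the SVE area), no single VL-scaled
    // addressing mode off SP can express it, so the FP is the better base.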
    if (hasFP(MF) && (SPOffset.getFixed() ||
                      FPOffset.getScalable() < SPOffset.getScalable() ||
                      RegInfo->hasStackRealignment(MF))) {
      FrameReg = RegInfo->getFrameRegister(MF);
      return FPOffset;
    }

    FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
                                           : (unsigned)AArch64::SP;
    return SPOffset;
  }

  StackOffset ScalableOffset = {};
  if (UseFP && !(isFixed || isCSR))
    ScalableOffset = -SVEStackSize;
  if (!UseFP && (isFixed || isCSR))
    ScalableOffset = SVEStackSize;

  if (UseFP) {
    FrameReg = RegInfo->getFrameRegister(MF);
    return StackOffset::getFixed(FPOffset) + ScalableOffset;
  }

  // Use the base pointer if we have one.
  if (RegInfo->hasBasePointer(MF))
    FrameReg = RegInfo->getBaseRegister();
  else {
    assert(!MFI.hasVarSizedObjects() &&
           "Can't use SP when we have var sized objects.");
    FrameReg = AArch64::SP;
    // If we're using the red zone for this function, the SP won't actually
    // be adjusted, so the offsets will be negative. They're also all
    // within range of the signed 9-bit immediate instructions.
    if (canUseRedZone(MF))
      Offset -= AFI->getLocalStackSize();
  }

  return StackOffset::getFixed(Offset) + ScalableOffset;
}

static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
  // Do not set a kill flag on values that are also marked as live-in. This
  // happens with the @llvm.returnaddress intrinsic and with arguments passed in
  // callee saved registers.
  // Omitting the kill flags is conservatively correct even if the live-in
  // is not used after all.
  bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
  return getKillRegState(!IsLiveIn);
}

static bool produceCompactUnwindFrame(MachineFunction &MF) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  AttributeList Attrs = MF.getFunction().getAttributes();
  return Subtarget.isTargetMachO() &&
         !(Subtarget.getTargetLowering()->supportSwiftError() &&
           Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
         MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
}

static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
                                             bool NeedsWinCFI, bool IsFirst) {
  // If we are generating register pairs for a Windows function that requires
  // EH support, then pair consecutive registers only. There are no unwind
  // opcodes for saves/restores of non-consecutive register pairs.
  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
  // save_lrpair.
  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling

  if (Reg2 == AArch64::FP)
    return true;
  if (!NeedsWinCFI)
    return false;
  if (Reg2 == Reg1 + 1)
    return false;
  // If pairing a GPR with LR, the pair can be described by the save_lrpair
  // opcode. If this is the first register pair, it would end up with a
  // predecrement, but there's no save_lrpair_x opcode, so we can only do this
  // if LR is paired with something else than the first register.
  // The save_lrpair opcode requires the first register to be an odd one.
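  // For example, (x21, lr) in the middle of the prologue can be described by
  // save_lrpair, while (x20, lr), or an (x21, lr) pair that needs the
  // pre-decrement form, cannot.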
2185 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 && 2186 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst) 2187 return false; 2188 return true; 2189 } 2190 2191 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction. 2192 /// WindowsCFI requires that only consecutive registers can be paired. 2193 /// LR and FP need to be allocated together when the frame needs to save 2194 /// the frame-record. This means any other register pairing with LR is invalid. 2195 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, 2196 bool UsesWinAAPCS, bool NeedsWinCFI, 2197 bool NeedsFrameRecord, bool IsFirst) { 2198 if (UsesWinAAPCS) 2199 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst); 2200 2201 // If we need to store the frame record, don't pair any register 2202 // with LR other than FP. 2203 if (NeedsFrameRecord) 2204 return Reg2 == AArch64::LR; 2205 2206 return false; 2207 } 2208 2209 namespace { 2210 2211 struct RegPairInfo { 2212 unsigned Reg1 = AArch64::NoRegister; 2213 unsigned Reg2 = AArch64::NoRegister; 2214 int FrameIdx; 2215 int Offset; 2216 enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type; 2217 2218 RegPairInfo() = default; 2219 2220 bool isPaired() const { return Reg2 != AArch64::NoRegister; } 2221 2222 unsigned getScale() const { 2223 switch (Type) { 2224 case PPR: 2225 return 2; 2226 case GPR: 2227 case FPR64: 2228 return 8; 2229 case ZPR: 2230 case FPR128: 2231 return 16; 2232 } 2233 llvm_unreachable("Unsupported type"); 2234 } 2235 2236 bool isScalable() const { return Type == PPR || Type == ZPR; } 2237 }; 2238 2239 } // end anonymous namespace 2240 2241 static void computeCalleeSaveRegisterPairs( 2242 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI, 2243 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, 2244 bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { 2245 2246 if (CSI.empty()) 2247 return; 2248 2249 bool IsWindows = isTargetWindows(MF); 2250 bool NeedsWinCFI = needsWinCFI(MF); 2251 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 2252 MachineFrameInfo &MFI = MF.getFrameInfo(); 2253 CallingConv::ID CC = MF.getFunction().getCallingConv(); 2254 unsigned Count = CSI.size(); 2255 (void)CC; 2256 // MachO's compact unwind format relies on all registers being stored in 2257 // pairs. 2258 assert((!produceCompactUnwindFrame(MF) || 2259 CC == CallingConv::PreserveMost || 2260 (Count & 1) == 0) && 2261 "Odd number of callee-saved regs to spill!"); 2262 int ByteOffset = AFI->getCalleeSavedStackSize(); 2263 int StackFillDir = -1; 2264 int RegInc = 1; 2265 unsigned FirstReg = 0; 2266 if (NeedsWinCFI) { 2267 // For WinCFI, fill the stack from the bottom up. 2268 ByteOffset = 0; 2269 StackFillDir = 1; 2270 // As the CSI array is reversed to match PrologEpilogInserter, iterate 2271 // backwards, to pair up registers starting from lower numbered registers. 2272 RegInc = -1; 2273 FirstReg = Count - 1; 2274 } 2275 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize(); 2276 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); 2277 2278 // When iterating backwards, the loop condition relies on unsigned wraparound. 
2279 for (unsigned i = FirstReg; i < Count; i += RegInc) { 2280 RegPairInfo RPI; 2281 RPI.Reg1 = CSI[i].getReg(); 2282 2283 if (AArch64::GPR64RegClass.contains(RPI.Reg1)) 2284 RPI.Type = RegPairInfo::GPR; 2285 else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) 2286 RPI.Type = RegPairInfo::FPR64; 2287 else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) 2288 RPI.Type = RegPairInfo::FPR128; 2289 else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) 2290 RPI.Type = RegPairInfo::ZPR; 2291 else if (AArch64::PPRRegClass.contains(RPI.Reg1)) 2292 RPI.Type = RegPairInfo::PPR; 2293 else 2294 llvm_unreachable("Unsupported register class."); 2295 2296 // Add the next reg to the pair if it is in the same register class. 2297 if (unsigned(i + RegInc) < Count) { 2298 unsigned NextReg = CSI[i + RegInc].getReg(); 2299 bool IsFirst = i == FirstReg; 2300 switch (RPI.Type) { 2301 case RegPairInfo::GPR: 2302 if (AArch64::GPR64RegClass.contains(NextReg) && 2303 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, 2304 NeedsWinCFI, NeedsFrameRecord, IsFirst)) 2305 RPI.Reg2 = NextReg; 2306 break; 2307 case RegPairInfo::FPR64: 2308 if (AArch64::FPR64RegClass.contains(NextReg) && 2309 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, 2310 IsFirst)) 2311 RPI.Reg2 = NextReg; 2312 break; 2313 case RegPairInfo::FPR128: 2314 if (AArch64::FPR128RegClass.contains(NextReg)) 2315 RPI.Reg2 = NextReg; 2316 break; 2317 case RegPairInfo::PPR: 2318 case RegPairInfo::ZPR: 2319 break; 2320 } 2321 } 2322 2323 // If either of the registers to be saved is the lr register, it means that 2324 // we also need to save lr in the shadow call stack. 2325 if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) && 2326 MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { 2327 if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18)) 2328 report_fatal_error("Must reserve x18 to use shadow call stack"); 2329 NeedShadowCallStackProlog = true; 2330 } 2331 2332 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI 2333 // list to come in sorted by frame index so that we can issue the store 2334 // pair instructions directly. Assert if we see anything otherwise. 2335 // 2336 // The order of the registers in the list is controlled by 2337 // getCalleeSavedRegs(), so they will always be in-order, as well. 2338 assert((!RPI.isPaired() || 2339 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) && 2340 "Out of order callee saved regs!"); 2341 2342 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || 2343 RPI.Reg1 == AArch64::LR) && 2344 "FrameRecord must be allocated together with LR"); 2345 2346 // Windows AAPCS has FP and LR reversed. 2347 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP || 2348 RPI.Reg2 == AArch64::LR) && 2349 "FrameRecord must be allocated together with LR"); 2350 2351 // MachO's compact unwind format relies on all registers being stored in 2352 // adjacent register pairs. 2353 assert((!produceCompactUnwindFrame(MF) || 2354 CC == CallingConv::PreserveMost || 2355 (RPI.isPaired() && 2356 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || 2357 RPI.Reg1 + 1 == RPI.Reg2))) && 2358 "Callee-save registers not saved as adjacent register pair!"); 2359 2360 RPI.FrameIdx = CSI[i].getFrameIdx(); 2361 if (NeedsWinCFI && 2362 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair 2363 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); 2364 2365 int Scale = RPI.getScale(); 2366 2367 int OffsetPre = RPI.isScalable() ? 
ScalableByteOffset : ByteOffset;
    assert(OffsetPre % Scale == 0);

    if (RPI.isScalable())
      ScalableByteOffset += StackFillDir * Scale;
    else
      ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);

    // Swift's async context is directly before FP, so allocate an extra
    // 8 bytes for it.
    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
        RPI.Reg2 == AArch64::FP)
      ByteOffset += StackFillDir * 8;

    assert(!(RPI.isScalable() && RPI.isPaired()) &&
           "Paired spill/fill instructions don't exist for SVE vectors");

    // Round up size of non-pair to pair size if we need to pad the
    // callee-save area to ensure 16-byte alignment.
    if (NeedGapToAlignStack && !NeedsWinCFI &&
        !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
        !RPI.isPaired() && ByteOffset % 16 != 0) {
      ByteOffset += 8 * StackFillDir;
      assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
      // A stack frame with a gap looks like this, bottom up:
      // d9, d8. x21, gap, x20, x19.
      // Set extra alignment on the x21 object to create the gap above it.
      MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
      NeedGapToAlignStack = false;
    }

    int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
    assert(OffsetPost % Scale == 0);
    // If filling top down (default), we want the offset after incrementing it.
    // If filling bottom up (WinCFI) we need the original offset.
    int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;

    // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
    // Swift context can directly precede FP.
    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
        RPI.Reg2 == AArch64::FP)
      Offset += 8;
    RPI.Offset = Offset / Scale;

    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
           "Offset out of bounds for LDP/STP immediate");

    // Save the offset to frame record so that the FP register can point to the
    // innermost frame record (spilled FP and LR registers).
    if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
                              RPI.Reg2 == AArch64::FP) ||
                             (IsWindows && RPI.Reg1 == AArch64::FP &&
                              RPI.Reg2 == AArch64::LR)))
      AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);

    RegPairs.push_back(RPI);
    if (RPI.isPaired())
      i += RegInc;
  }
  if (NeedsWinCFI) {
    // If we need an alignment gap in the stack, align the topmost stack
    // object. A stack frame with a gap looks like this, bottom up:
    // x19, d8. d9, gap.
    // Set extra alignment on the topmost stack object (the first element in
    // CSI, which goes top down), to create the gap above it.
    if (AFI->hasCalleeSaveStackFreeSpace())
      MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
    // We iterated bottom up over the registers; flip RegPairs back to top
    // down order.
2437 std::reverse(RegPairs.begin(), RegPairs.end()); 2438 } 2439 } 2440 2441 bool AArch64FrameLowering::spillCalleeSavedRegisters( 2442 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, 2443 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { 2444 MachineFunction &MF = *MBB.getParent(); 2445 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); 2446 bool NeedsWinCFI = needsWinCFI(MF); 2447 DebugLoc DL; 2448 SmallVector<RegPairInfo, 8> RegPairs; 2449 2450 bool NeedShadowCallStackProlog = false; 2451 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, 2452 NeedShadowCallStackProlog, hasFP(MF)); 2453 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2454 2455 if (NeedShadowCallStackProlog) { 2456 // Shadow call stack prolog: str x30, [x18], #8 2457 BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost)) 2458 .addReg(AArch64::X18, RegState::Define) 2459 .addReg(AArch64::LR) 2460 .addReg(AArch64::X18) 2461 .addImm(8) 2462 .setMIFlag(MachineInstr::FrameSetup); 2463 2464 if (NeedsWinCFI) 2465 BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop)) 2466 .setMIFlag(MachineInstr::FrameSetup); 2467 2468 if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) { 2469 // Emit a CFI instruction that causes 8 to be subtracted from the value of 2470 // x18 when unwinding past this frame. 2471 static const char CFIInst[] = { 2472 dwarf::DW_CFA_val_expression, 2473 18, // register 2474 2, // length 2475 static_cast<char>(unsigned(dwarf::DW_OP_breg18)), 2476 static_cast<char>(-8) & 0x7f, // addend (sleb128) 2477 }; 2478 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape( 2479 nullptr, StringRef(CFIInst, sizeof(CFIInst)))); 2480 BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION)) 2481 .addCFIIndex(CFIIndex) 2482 .setMIFlag(MachineInstr::FrameSetup); 2483 } 2484 2485 // This instruction also makes x18 live-in to the entry block. 2486 MBB.addLiveIn(AArch64::X18); 2487 } 2488 2489 if (homogeneousPrologEpilog(MF)) { 2490 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog)) 2491 .setMIFlag(MachineInstr::FrameSetup); 2492 2493 for (auto &RPI : RegPairs) { 2494 MIB.addReg(RPI.Reg1); 2495 MIB.addReg(RPI.Reg2); 2496 2497 // Update register live in. 2498 if (!MRI.isReserved(RPI.Reg1)) 2499 MBB.addLiveIn(RPI.Reg1); 2500 if (!MRI.isReserved(RPI.Reg2)) 2501 MBB.addLiveIn(RPI.Reg2); 2502 } 2503 return true; 2504 } 2505 for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; 2506 ++RPII) { 2507 RegPairInfo RPI = *RPII; 2508 unsigned Reg1 = RPI.Reg1; 2509 unsigned Reg2 = RPI.Reg2; 2510 unsigned StrOpc; 2511 2512 // Issue sequence of spills for cs regs. The first spill may be converted 2513 // to a pre-decrement store later by emitPrologue if the callee-save stack 2514 // area allocation can't be combined with the local stack area allocation. 2515 // For example: 2516 // stp x22, x21, [sp, #0] // addImm(+0) 2517 // stp x20, x19, [sp, #16] // addImm(+2) 2518 // stp fp, lr, [sp, #32] // addImm(+4) 2519 // Rationale: This sequence saves uop updates compared to a sequence of 2520 // pre-increment spills like stp xi,xj,[sp,#-16]! 2521 // Note: Similar rationale and sequence for restores in epilog. 2522 unsigned Size; 2523 Align Alignment; 2524 switch (RPI.Type) { 2525 case RegPairInfo::GPR: 2526 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; 2527 Size = 8; 2528 Alignment = Align(8); 2529 break; 2530 case RegPairInfo::FPR64: 2531 StrOpc = RPI.isPaired() ? 
AArch64::STPDi : AArch64::STRDui;
      Size = 8;
      Alignment = Align(8);
      break;
    case RegPairInfo::FPR128:
      StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
      Size = 16;
      Alignment = Align(16);
      break;
    case RegPairInfo::ZPR:
      StrOpc = AArch64::STR_ZXI;
      Size = 16;
      Alignment = Align(16);
      break;
    case RegPairInfo::PPR:
      StrOpc = AArch64::STR_PXI;
      Size = 2;
      Alignment = Align(2);
      break;
    }
    LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
               dbgs() << ") -> fi#(" << RPI.FrameIdx;
               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
               dbgs() << ")\n");

    assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
           "Windows unwinding requires a consecutive (FP,LR) pair");
    // Windows unwind codes require consecutive registers if registers are
    // paired. Make the switch here, so that the code below will save (x,x+1)
    // and not (x+1,x).
    unsigned FrameIdxReg1 = RPI.FrameIdx;
    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
    if (NeedsWinCFI && RPI.isPaired()) {
      std::swap(Reg1, Reg2);
      std::swap(FrameIdxReg1, FrameIdxReg2);
    }
    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
    if (!MRI.isReserved(Reg1))
      MBB.addLiveIn(Reg1);
    if (RPI.isPaired()) {
      if (!MRI.isReserved(Reg2))
        MBB.addLiveIn(Reg2);
      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
          MachineMemOperand::MOStore, Size, Alignment));
    }
    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
        .addReg(AArch64::SP)
        .addImm(RPI.Offset) // [sp, #offset*scale],
                            // where factor*scale is implicit
        .setMIFlag(MachineInstr::FrameSetup);
    MIB.addMemOperand(MF.getMachineMemOperand(
        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
        MachineMemOperand::MOStore, Size, Alignment));
    if (NeedsWinCFI)
      InsertSEH(MIB, TII, MachineInstr::FrameSetup);

    // Update the StackIDs of the SVE stack slots.
    MachineFrameInfo &MFI = MF.getFrameInfo();
    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
      MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);

  }
  return true;
}

bool AArch64FrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  DebugLoc DL;
  SmallVector<RegPairInfo, 8> RegPairs;
  bool NeedsWinCFI = needsWinCFI(MF);

  if (MI != MBB.end())
    DL = MI->getDebugLoc();

  bool NeedShadowCallStackProlog = false;
  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
                                 NeedShadowCallStackProlog, hasFP(MF));

  auto EmitMI = [&](const RegPairInfo &RPI) {
    unsigned Reg1 = RPI.Reg1;
    unsigned Reg2 = RPI.Reg2;

    // Issue sequence of restores for cs regs. The last restore may be converted
    // to a post-increment load later by emitEpilogue if the callee-save stack
    // area allocation can't be combined with the local stack area allocation.
2622 // For example: 2623 // ldp fp, lr, [sp, #32] // addImm(+4) 2624 // ldp x20, x19, [sp, #16] // addImm(+2) 2625 // ldp x22, x21, [sp, #0] // addImm(+0) 2626 // Note: see comment in spillCalleeSavedRegisters() 2627 unsigned LdrOpc; 2628 unsigned Size; 2629 Align Alignment; 2630 switch (RPI.Type) { 2631 case RegPairInfo::GPR: 2632 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; 2633 Size = 8; 2634 Alignment = Align(8); 2635 break; 2636 case RegPairInfo::FPR64: 2637 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; 2638 Size = 8; 2639 Alignment = Align(8); 2640 break; 2641 case RegPairInfo::FPR128: 2642 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; 2643 Size = 16; 2644 Alignment = Align(16); 2645 break; 2646 case RegPairInfo::ZPR: 2647 LdrOpc = AArch64::LDR_ZXI; 2648 Size = 16; 2649 Alignment = Align(16); 2650 break; 2651 case RegPairInfo::PPR: 2652 LdrOpc = AArch64::LDR_PXI; 2653 Size = 2; 2654 Alignment = Align(2); 2655 break; 2656 } 2657 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); 2658 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); 2659 dbgs() << ") -> fi#(" << RPI.FrameIdx; 2660 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; 2661 dbgs() << ")\n"); 2662 2663 // Windows unwind codes require consecutive registers if registers are 2664 // paired. Make the switch here, so that the code below will save (x,x+1) 2665 // and not (x+1,x). 2666 unsigned FrameIdxReg1 = RPI.FrameIdx; 2667 unsigned FrameIdxReg2 = RPI.FrameIdx + 1; 2668 if (NeedsWinCFI && RPI.isPaired()) { 2669 std::swap(Reg1, Reg2); 2670 std::swap(FrameIdxReg1, FrameIdxReg2); 2671 } 2672 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); 2673 if (RPI.isPaired()) { 2674 MIB.addReg(Reg2, getDefRegState(true)); 2675 MIB.addMemOperand(MF.getMachineMemOperand( 2676 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), 2677 MachineMemOperand::MOLoad, Size, Alignment)); 2678 } 2679 MIB.addReg(Reg1, getDefRegState(true)) 2680 .addReg(AArch64::SP) 2681 .addImm(RPI.Offset) // [sp, #offset*scale] 2682 // where factor*scale is implicit 2683 .setMIFlag(MachineInstr::FrameDestroy); 2684 MIB.addMemOperand(MF.getMachineMemOperand( 2685 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), 2686 MachineMemOperand::MOLoad, Size, Alignment)); 2687 if (NeedsWinCFI) 2688 InsertSEH(MIB, TII, MachineInstr::FrameDestroy); 2689 }; 2690 2691 // SVE objects are always restored in reverse order. 2692 for (const RegPairInfo &RPI : reverse(RegPairs)) 2693 if (RPI.isScalable()) 2694 EmitMI(RPI); 2695 2696 if (ReverseCSRRestoreSeq) { 2697 for (const RegPairInfo &RPI : reverse(RegPairs)) 2698 if (!RPI.isScalable()) 2699 EmitMI(RPI); 2700 } else if (homogeneousPrologEpilog(MF, &MBB)) { 2701 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog)) 2702 .setMIFlag(MachineInstr::FrameDestroy); 2703 for (auto &RPI : RegPairs) { 2704 MIB.addReg(RPI.Reg1, RegState::Define); 2705 MIB.addReg(RPI.Reg2, RegState::Define); 2706 } 2707 return true; 2708 } else 2709 for (const RegPairInfo &RPI : RegPairs) 2710 if (!RPI.isScalable()) 2711 EmitMI(RPI); 2712 2713 if (NeedShadowCallStackProlog) { 2714 // Shadow call stack epilog: ldr x30, [x18, #-8]! 
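    // This mirrors the post-increment store in the prolog (str x30, [x18], #8):
    // the pre-decrement steps x18 back to the slot and reloads LR from it,
    // leaving x18 exactly where the prolog found it.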
2715 BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre)) 2716 .addReg(AArch64::X18, RegState::Define) 2717 .addReg(AArch64::LR, RegState::Define) 2718 .addReg(AArch64::X18) 2719 .addImm(-8) 2720 .setMIFlag(MachineInstr::FrameDestroy); 2721 } 2722 2723 return true; 2724 } 2725 2726 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, 2727 BitVector &SavedRegs, 2728 RegScavenger *RS) const { 2729 // All calls are tail calls in GHC calling conv, and functions have no 2730 // prologue/epilogue. 2731 if (MF.getFunction().getCallingConv() == CallingConv::GHC) 2732 return; 2733 2734 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); 2735 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( 2736 MF.getSubtarget().getRegisterInfo()); 2737 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 2738 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 2739 unsigned UnspilledCSGPR = AArch64::NoRegister; 2740 unsigned UnspilledCSGPRPaired = AArch64::NoRegister; 2741 2742 MachineFrameInfo &MFI = MF.getFrameInfo(); 2743 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); 2744 2745 unsigned BasePointerReg = RegInfo->hasBasePointer(MF) 2746 ? RegInfo->getBaseRegister() 2747 : (unsigned)AArch64::NoRegister; 2748 2749 unsigned ExtraCSSpill = 0; 2750 // Figure out which callee-saved registers to save/restore. 2751 for (unsigned i = 0; CSRegs[i]; ++i) { 2752 const unsigned Reg = CSRegs[i]; 2753 2754 // Add the base pointer register to SavedRegs if it is callee-save. 2755 if (Reg == BasePointerReg) 2756 SavedRegs.set(Reg); 2757 2758 bool RegUsed = SavedRegs.test(Reg); 2759 unsigned PairedReg = AArch64::NoRegister; 2760 if (AArch64::GPR64RegClass.contains(Reg) || 2761 AArch64::FPR64RegClass.contains(Reg) || 2762 AArch64::FPR128RegClass.contains(Reg)) 2763 PairedReg = CSRegs[i ^ 1]; 2764 2765 if (!RegUsed) { 2766 if (AArch64::GPR64RegClass.contains(Reg) && 2767 !RegInfo->isReservedReg(MF, Reg)) { 2768 UnspilledCSGPR = Reg; 2769 UnspilledCSGPRPaired = PairedReg; 2770 } 2771 continue; 2772 } 2773 2774 // MachO's compact unwind format relies on all registers being stored in 2775 // pairs. 2776 // FIXME: the usual format is actually better if unwinding isn't needed. 2777 if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister && 2778 !SavedRegs.test(PairedReg)) { 2779 SavedRegs.set(PairedReg); 2780 if (AArch64::GPR64RegClass.contains(PairedReg) && 2781 !RegInfo->isReservedReg(MF, PairedReg)) 2782 ExtraCSSpill = PairedReg; 2783 } 2784 } 2785 2786 if (MF.getFunction().getCallingConv() == CallingConv::Win64 && 2787 !Subtarget.isTargetWindows()) { 2788 // For Windows calling convention on a non-windows OS, where X18 is treated 2789 // as reserved, back up X18 when entering non-windows code (marked with the 2790 // Windows calling convention) and restore when returning regardless of 2791 // whether the individual function uses it - it might call other functions 2792 // that clobber it. 2793 SavedRegs.set(AArch64::X18); 2794 } 2795 2796 // Calculates the callee saved stack size. 
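  // (Illustrative: saving {x19, x20, fp, lr} contributes 4 * 8 = 32 bytes to
  // CSStackSize, while saving {z8, p4} contributes 16 + 2 = 18 bytes to
  // SVECSStackSize; scalable registers are counted at their minimum size.)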
2797 unsigned CSStackSize = 0; 2798 unsigned SVECSStackSize = 0; 2799 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 2800 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2801 for (unsigned Reg : SavedRegs.set_bits()) { 2802 auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; 2803 if (AArch64::PPRRegClass.contains(Reg) || 2804 AArch64::ZPRRegClass.contains(Reg)) 2805 SVECSStackSize += RegSize; 2806 else 2807 CSStackSize += RegSize; 2808 } 2809 2810 // Save number of saved regs, so we can easily update CSStackSize later. 2811 unsigned NumSavedRegs = SavedRegs.count(); 2812 2813 // The frame record needs to be created by saving the appropriate registers 2814 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF); 2815 if (hasFP(MF) || 2816 windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) { 2817 SavedRegs.set(AArch64::FP); 2818 SavedRegs.set(AArch64::LR); 2819 } 2820 2821 LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; 2822 for (unsigned Reg 2823 : SavedRegs.set_bits()) dbgs() 2824 << ' ' << printReg(Reg, RegInfo); 2825 dbgs() << "\n";); 2826 2827 // If any callee-saved registers are used, the frame cannot be eliminated. 2828 int64_t SVEStackSize = 2829 alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16); 2830 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; 2831 2832 // The CSR spill slots have not been allocated yet, so estimateStackSize 2833 // won't include them. 2834 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); 2835 2836 // Conservatively always assume BigStack when there are SVE spills. 2837 bool BigStack = SVEStackSize || 2838 (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; 2839 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) 2840 AFI->setHasStackFrame(true); 2841 2842 // Estimate if we might need to scavenge a register at some point in order 2843 // to materialize a stack offset. If so, either spill one additional 2844 // callee-saved register or reserve a special spill slot to facilitate 2845 // register scavenging. If we already spilled an extra callee-saved register 2846 // above to keep the number of spills even, we don't need to do anything else 2847 // here. 2848 if (BigStack) { 2849 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { 2850 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo) 2851 << " to get a scratch register.\n"); 2852 SavedRegs.set(UnspilledCSGPR); 2853 // MachO's compact unwind format relies on all registers being stored in 2854 // pairs, so if we need to spill one extra for BigStack, then we need to 2855 // store the pair. 2856 if (producePairRegisters(MF)) 2857 SavedRegs.set(UnspilledCSGPRPaired); 2858 ExtraCSSpill = UnspilledCSGPR; 2859 } 2860 2861 // If we didn't find an extra callee-saved register to spill, create 2862 // an emergency spill slot. 2863 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { 2864 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 2865 const TargetRegisterClass &RC = AArch64::GPR64RegClass; 2866 unsigned Size = TRI->getSpillSize(RC); 2867 Align Alignment = TRI->getSpillAlign(RC); 2868 int FI = MFI.CreateStackObject(Size, Alignment, false); 2869 RS->addScavengingFrameIndex(FI); 2870 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI 2871 << " as the emergency spill slot.\n"); 2872 } 2873 } 2874 2875 // Adding the size of additional 64bit GPR saves. 
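  // (E.g. if FP, LR and one scratch GPR were added above, this accounts for
  // another 24 bytes; everything added after the initial scan is a 64-bit GPR.)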
2876 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs); 2877 2878 // A Swift asynchronous context extends the frame record with a pointer 2879 // directly before FP. 2880 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) 2881 CSStackSize += 8; 2882 2883 uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16); 2884 LLVM_DEBUG(dbgs() << "Estimated stack frame size: " 2885 << EstimatedStackSize + AlignedCSStackSize 2886 << " bytes.\n"); 2887 2888 assert((!MFI.isCalleeSavedInfoValid() || 2889 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) && 2890 "Should not invalidate callee saved info"); 2891 2892 // Round up to register pair alignment to avoid additional SP adjustment 2893 // instructions. 2894 AFI->setCalleeSavedStackSize(AlignedCSStackSize); 2895 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); 2896 AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); 2897 } 2898 2899 bool AArch64FrameLowering::assignCalleeSavedSpillSlots( 2900 MachineFunction &MF, const TargetRegisterInfo *RegInfo, 2901 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, 2902 unsigned &MaxCSFrameIndex) const { 2903 bool NeedsWinCFI = needsWinCFI(MF); 2904 // To match the canonical windows frame layout, reverse the list of 2905 // callee saved registers to get them laid out by PrologEpilogInserter 2906 // in the right order. (PrologEpilogInserter allocates stack objects top 2907 // down. Windows canonical prologs store higher numbered registers at 2908 // the top, thus have the CSI array start from the highest registers.) 2909 if (NeedsWinCFI) 2910 std::reverse(CSI.begin(), CSI.end()); 2911 2912 if (CSI.empty()) 2913 return true; // Early exit if no callee saved registers are modified! 2914 2915 // Now that we know which registers need to be saved and restored, allocate 2916 // stack slots for them. 2917 MachineFrameInfo &MFI = MF.getFrameInfo(); 2918 auto *AFI = MF.getInfo<AArch64FunctionInfo>(); 2919 for (auto &CS : CSI) { 2920 Register Reg = CS.getReg(); 2921 const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); 2922 2923 unsigned Size = RegInfo->getSpillSize(*RC); 2924 Align Alignment(RegInfo->getSpillAlign(*RC)); 2925 int FrameIdx = MFI.CreateStackObject(Size, Alignment, true); 2926 CS.setFrameIdx(FrameIdx); 2927 2928 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; 2929 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; 2930 2931 // Grab 8 bytes below FP for the extended asynchronous frame info. 2932 if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) { 2933 FrameIdx = MFI.CreateStackObject(8, Alignment, true); 2934 AFI->setSwiftAsyncContextFrameIdx(FrameIdx); 2935 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; 2936 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; 2937 } 2938 } 2939 return true; 2940 } 2941 2942 bool AArch64FrameLowering::enableStackSlotScavenging( 2943 const MachineFunction &MF) const { 2944 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 2945 return AFI->hasCalleeSaveStackFreeSpace(); 2946 } 2947 2948 /// returns true if there are any SVE callee saves. 
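/// For example (illustrative indices): with SVE callee-save slots at frame
/// indices 2, 3 and 4, this sets Min = 2, Max = 4 and returns true; with no
/// SVE callee saves it leaves the sentinels in place and returns false.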
/// Returns true if there are any SVE callee saves.
static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
                                      int &Min, int &Max) {
  Min = std::numeric_limits<int>::max();
  Max = std::numeric_limits<int>::min();

  if (!MFI.isCalleeSavedInfoValid())
    return false;

  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  for (auto &CS : CSI) {
    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
        AArch64::PPRRegClass.contains(CS.getReg())) {
      assert((Max == std::numeric_limits<int>::min() ||
              Max + 1 == CS.getFrameIdx()) &&
             "SVE CalleeSaves are not consecutive");

      Min = std::min(Min, CS.getFrameIdx());
      Max = std::max(Max, CS.getFrameIdx());
    }
  }
  return Min != std::numeric_limits<int>::max();
}

// Process all the SVE stack objects and determine offsets for each
// object. If AssignOffsets is true, the offsets get assigned.
// Fills in the first and last callee-saved frame indices into
// Min/MaxCSFrameIndex, respectively.
// Returns the size of the stack.
static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
                                              int &MinCSFrameIndex,
                                              int &MaxCSFrameIndex,
                                              bool AssignOffsets) {
#ifndef NDEBUG
  // First process all fixed stack objects.
  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
    assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
           "SVE vectors should never be passed on the stack by value, only by "
           "reference.");
#endif

  auto Assign = [&MFI](int FI, int64_t Offset) {
    LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
    MFI.setObjectOffset(FI, Offset);
  };

  int64_t Offset = 0;

  // Then process all callee-saved slots.
  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
    // Assign offsets to the callee-save slots.
    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
      Offset += MFI.getObjectSize(I);
      Offset = alignTo(Offset, MFI.getObjectAlign(I));
      if (AssignOffsets)
        Assign(I, -Offset);
    }
  }

  // Ensure that the callee-save area is aligned to 16 bytes.
  Offset = alignTo(Offset, Align(16U));

  // Collect the remaining SVE stack objects to allocate.
  SmallVector<int, 8> ObjectsToAllocate;
  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
    unsigned StackID = MFI.getStackID(I);
    if (StackID != TargetStackID::ScalableVector)
      continue;
    if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
      continue;
    if (MFI.isDeadObjectIndex(I))
      continue;

    ObjectsToAllocate.push_back(I);
  }
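  // Illustrative example of the offsets assigned above (hypothetical):
  // spilling z8 and z9 creates two consecutive 16-byte scalable slots at
  // offsets -16 and -32 from the top of the SVE area, later addressed
  // VL-scaled, e.g.:
  //   str z8, [fp, #-1 mul vl]
  //   str z9, [fp, #-2 mul vl]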
  // Allocate all SVE locals and spills.
  for (unsigned FI : ObjectsToAllocate) {
    Align Alignment = MFI.getObjectAlign(FI);
    // FIXME: Given that the length of SVE vectors is not necessarily a power
    // of two, we'd need to align every object dynamically at runtime if the
    // alignment is larger than 16. This is not yet supported.
    if (Alignment > Align(16))
      report_fatal_error(
          "Alignment of scalable vectors > 16 bytes is not yet supported");

    Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
    if (AssignOffsets)
      Assign(FI, -Offset);
  }

  return Offset;
}

int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
    MachineFrameInfo &MFI) const {
  int MinCSFrameIndex, MaxCSFrameIndex;
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        false);
}

int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        true);
}

void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
         "Upwards growing stack unsupported");

  int MinCSFrameIndex, MaxCSFrameIndex;
  int64_t SVEStackSize =
      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);

  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);

  // If this function isn't doing Win64-style C++ EH, we don't need to do
  // anything.
  if (!MF.hasEHFunclets())
    return;
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();

  MachineBasicBlock &MBB = MF.front();
  auto MBBI = MBB.begin();
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

  // Create an UnwindHelp object.
  // The UnwindHelp object is allocated at the start of the fixed object area.
  int64_t FixedObject =
      getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
  int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
                                           /*SPOffset*/ -FixedObject,
                                           /*IsImmutable=*/false);
  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;

  // We need to store -2 into the UnwindHelp object at the start of the
  // function.
  DebugLoc DL;
  RS->enterBasicBlockEnd(MBB);
  RS->backward(std::prev(MBBI));
  unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
  assert(DstReg && "There must be a free register after frame setup");
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
      .addReg(DstReg, getKillRegState(true))
      .addFrameIndex(UnwindHelpFI)
      .addImm(0);
}
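// Illustrative expansion (hypothetical register and offset): the two
// instructions built above typically lower to something like
//   mov  x8, #-2
//   stur x8, [fp, #-16]
// so the UnwindHelp slot that the Win64 C++ EH runtime inspects is
// initialized before any user code runs.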
namespace {
struct TagStoreInstr {
  MachineInstr *MI;
  int64_t Offset, Size;
  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
      : MI(MI), Offset(Offset), Size(Size) {}
};

class TagStoreEdit {
  MachineFunction *MF;
  MachineBasicBlock *MBB;
  MachineRegisterInfo *MRI;
  // Tag store instructions that are being replaced.
  SmallVector<TagStoreInstr, 8> TagStores;
  // Combined memref arguments of the above instructions.
  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;

  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
  // FrameRegOffset + Size) with the address tag of SP.
  Register FrameReg;
  StackOffset FrameRegOffset;
  int64_t Size;
  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
  Optional<int64_t> FrameRegUpdate;
  // MIFlags for any FrameReg updating instructions.
  unsigned FrameRegUpdateFlags;

  // Use zeroing instruction variants.
  bool ZeroData;
  DebugLoc DL;

  void emitUnrolled(MachineBasicBlock::iterator InsertI);
  void emitLoop(MachineBasicBlock::iterator InsertI);

public:
  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
      : MBB(MBB), ZeroData(ZeroData) {
    MF = MBB->getParent();
    MRI = &MF->getRegInfo();
  }
  // Add an instruction to be replaced. Instructions must be added in
  // ascending order of Offset, and have to be adjacent.
  void addInstruction(TagStoreInstr I) {
    assert((TagStores.empty() ||
            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
           "Non-adjacent tag store instructions.");
    TagStores.push_back(I);
  }
  void clear() { TagStores.clear(); }
  // Emit equivalent code at the given location, and erase the current set of
  // instructions. May skip if the replacement is not profitable. May
  // invalidate the input iterator and replace it with a valid one.
  void emitCode(MachineBasicBlock::iterator &InsertI,
                const AArch64FrameLowering *TFI, bool IsLast);
};

void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

  const int64_t kMinOffset = -256 * 16;
  const int64_t kMaxOffset = 255 * 16;

  Register BaseReg = FrameReg;
  int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
  if (BaseRegOffsetBytes < kMinOffset ||
      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
                    StackOffset::getFixed(BaseRegOffsetBytes), TII);
    BaseReg = ScratchReg;
    BaseRegOffsetBytes = 0;
  }

  MachineInstr *LastI = nullptr;
  while (Size) {
    int64_t InstrSize = (Size > 16) ? 32 : 16;
    unsigned Opcode =
        InstrSize == 16
            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
                          .addReg(AArch64::SP)
                          .addReg(BaseReg)
                          .addImm(BaseRegOffsetBytes / 16)
                          .setMemRefs(CombinedMemRefs);
    // A store to [BaseReg, #0] should go last for an opportunity to fold the
    // final SP adjustment in the epilogue.
    if (BaseRegOffsetBytes == 0)
      LastI = I;
    BaseRegOffsetBytes += InstrSize;
    Size -= InstrSize;
  }

  if (LastI)
    MBB->splice(InsertI, MBB, LastI);
}
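// Illustrative output of emitUnrolled (hypothetical sizes): 48 bytes of
// tagged stack starting at [BaseReg, #32] become one ST2G covering 32 bytes
// and one STG covering the rest:
//   st2g sp, [BaseReg, #32]
//   stg  sp, [BaseReg, #64]
// A store at offset #0, if there is one, is spliced last so the epilogue's
// final SP adjustment has a chance to fold into it.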
void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

  Register BaseReg = FrameRegUpdate
                         ? FrameReg
                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);

  int64_t LoopSize = Size;
  // If the loop size is not a multiple of 32, split off one 16-byte store at
  // the end to fold the BaseReg update into.
  if (FrameRegUpdate && *FrameRegUpdate)
    LoopSize -= LoopSize % 32;
  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
                                TII->get(ZeroData ? AArch64::STZGloop_wback
                                                  : AArch64::STGloop_wback))
                            .addDef(SizeReg)
                            .addDef(BaseReg)
                            .addImm(LoopSize)
                            .addReg(BaseReg)
                            .setMemRefs(CombinedMemRefs);
  if (FrameRegUpdate)
    LoopI->setFlags(FrameRegUpdateFlags);

  int64_t ExtraBaseRegUpdate =
      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
  if (LoopSize < Size) {
    assert(FrameRegUpdate);
    assert(Size - LoopSize == 16);
    // Tag 16 more bytes at BaseReg and update BaseReg.
    BuildMI(*MBB, InsertI, DL,
            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addReg(BaseReg)
        .addImm(1 + ExtraBaseRegUpdate / 16)
        .setMemRefs(CombinedMemRefs)
        .setMIFlags(FrameRegUpdateFlags);
  } else if (ExtraBaseRegUpdate) {
    // Update BaseReg.
    BuildMI(
        *MBB, InsertI, DL,
        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addImm(std::abs(ExtraBaseRegUpdate))
        .addImm(0)
        .setMIFlags(FrameRegUpdateFlags);
  }
}

// Check if *II is a register update that can be merged into the STGloop that
// ends at (Reg + Size). On success, *TotalOffset is set to the signed offset
// of the update instruction, i.e. the required adjustment to Reg after the
// end of the loop.
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
                       int64_t Size, int64_t *TotalOffset) {
  MachineInstr &MI = *II;
  if ((MI.getOpcode() == AArch64::ADDXri ||
       MI.getOpcode() == AArch64::SUBXri) &&
      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
    int64_t Offset = MI.getOperand(2).getImm() << Shift;
    if (MI.getOpcode() == AArch64::SUBXri)
      Offset = -Offset;
    int64_t AbsPostOffset = std::abs(Offset - Size);
    const int64_t kMaxOffset =
        0xFFF; // Max encoding for unshifted ADDXri / SUBXri.
    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
      *TotalOffset = Offset;
      return true;
    }
  }
  return false;
}
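// Illustrative merge (hypothetical): an epilogue such as
//   <STGloop covering Reg .. Reg + Size>
//   add sp, sp, #Size + 32
// can drop the standalone ADD and let the loop's final post-indexed store
// produce the updated SP, provided the residual adjustment is 16-byte
// aligned and fits the unshifted ADDXri/SUBXri immediate (<= 0xFFF), which
// is exactly what canMergeRegUpdate checks.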
void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
  MemRefs.clear();
  for (auto &TS : TSE) {
    MachineInstr *MI = TS.MI;
    // An instruction without memory operands may access anything. Be
    // conservative and return an empty list.
    if (MI->memoperands_empty()) {
      MemRefs.clear();
      return;
    }
    MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
  }
}

void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
                            const AArch64FrameLowering *TFI, bool IsLast) {
  if (TagStores.empty())
    return;
  TagStoreInstr &FirstTagStore = TagStores[0];
  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
  DL = TagStores[0].MI->getDebugLoc();

  Register Reg;
  FrameRegOffset = TFI->resolveFrameOffsetReference(
      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
      /*PreferFP=*/false, /*ForSimm=*/true);
  FrameReg = Reg;
  FrameRegUpdate = None;

  mergeMemRefs(TagStores, CombinedMemRefs);

  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
             for (const auto &Instr : TagStores)
               dbgs() << "  " << *Instr.MI;);

  // Size threshold where a loop becomes shorter than a linear sequence of
  // tagging instructions.
  const int kSetTagLoopThreshold = 176;
  if (Size < kSetTagLoopThreshold) {
    if (TagStores.size() < 2)
      return;
    emitUnrolled(InsertI);
  } else {
    MachineInstr *UpdateInstr = nullptr;
    int64_t TotalOffset;
    if (IsLast) {
      // See if we can merge the base register update into the STGloop.
      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
      // but STGloop is way too unusual for that, and also it only
      // realistically happens in the function epilogue. Also, STGloop is
      // expanded before that pass.
      if (InsertI != MBB->end() &&
          canMergeRegUpdate(InsertI, FrameReg,
                            FrameRegOffset.getFixed() + Size, &TotalOffset)) {
        UpdateInstr = &*InsertI++;
        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
                          << *UpdateInstr);
      }
    }

    if (!UpdateInstr && TagStores.size() < 2)
      return;

    if (UpdateInstr) {
      FrameRegUpdate = TotalOffset;
      FrameRegUpdateFlags = UpdateInstr->getFlags();
    }
    emitLoop(InsertI);
    if (UpdateInstr)
      UpdateInstr->eraseFromParent();
  }

  for (auto &TS : TagStores)
    TS.MI->eraseFromParent();
}

bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
                                        int64_t &Size, bool &ZeroData) {
  MachineFunction &MF = *MI.getParent()->getParent();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  unsigned Opcode = MI.getOpcode();
  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
              Opcode == AArch64::STZ2GOffset);

  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
      return false;
    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
      return false;
    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
    Size = MI.getOperand(2).getImm();
    return true;
  }

  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
    Size = 16;
  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
    Size = 32;
  else
    return false;

  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
    return false;

  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
           16 * MI.getOperand(2).getImm();
  return true;
}
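// Schematic MIR shapes accepted above (illustrative operand values):
//   STGloop $dead0, $dead1, 64, %stack.0
//     -> Size = 64, Offset = offset of %stack.0
//   STGOffset $sp, %stack.1, 2
//     -> Size = 16, Offset = offset of %stack.1 + 32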
// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
                                                const AArch64FrameLowering *TFI,
                                                RegScavenger *RS) {
  bool FirstZeroData;
  int64_t Size, Offset;
  MachineInstr &MI = *II;
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator NextI = ++II;
  if (&MI == &MBB->instr_back())
    return II;
  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
    return II;

  SmallVector<TagStoreInstr, 4> Instrs;
  Instrs.emplace_back(&MI, Offset, Size);

  constexpr int kScanLimit = 10;
  int Count = 0;
  for (MachineBasicBlock::iterator E = MBB->end();
       NextI != E && Count < kScanLimit; ++NextI) {
    MachineInstr &MI = *NextI;
    bool ZeroData;
    int64_t Size, Offset;
    // Collect instructions that update memory tags with a FrameIndex operand
    // and (when applicable) constant size, and whose output registers are
    // dead (the latter is almost always the case in practice). Since these
    // instructions effectively have no inputs or outputs, we are free to skip
    // any non-aliasing instructions in between without tracking used
    // registers.
    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
      if (ZeroData != FirstZeroData)
        break;
      Instrs.emplace_back(&MI, Offset, Size);
      continue;
    }

    // Only count non-transient, non-tagging instructions toward the scan
    // limit.
    if (!MI.isTransient())
      ++Count;

    // Just in case, stop before the epilogue code starts.
    if (MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy))
      break;

    // Reject anything that may alias the collected instructions.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
      break;
  }

  // New code will be inserted after the last tagging instruction we've found.
  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
  InsertI++;

  llvm::stable_sort(Instrs,
                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
                      return Left.Offset < Right.Offset;
                    });

  // Make sure that we don't have any overlapping stores.
  int64_t CurOffset = Instrs[0].Offset;
  for (auto &Instr : Instrs) {
    if (CurOffset > Instr.Offset)
      return NextI;
    CurOffset = Instr.Offset + Instr.Size;
  }
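  // Illustrative example (hypothetical offsets): stores tagging [0, 32) and
  // [48, 64) leave a gap at [32, 48), so the loop below emits one replacement
  // sequence per contiguous run instead of a single combined one.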
  // Find contiguous runs of tagged memory and emit shorter instruction
  // sequences for them when possible.
  TagStoreEdit TSE(MBB, FirstZeroData);
  Optional<int64_t> EndOffset;
  for (auto &Instr : Instrs) {
    if (EndOffset && *EndOffset != Instr.Offset) {
      // Found a gap.
      TSE.emitCode(InsertI, TFI, /*IsLast=*/false);
      TSE.clear();
    }

    TSE.addInstruction(Instr);
    EndOffset = Instr.Offset + Instr.Size;
  }

  TSE.emitCode(InsertI, TFI, /*IsLast=*/true);

  return InsertI;
}
} // namespace

void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS = nullptr) const {
  if (StackTaggingMergeSetTag)
    for (auto &BB : MF)
      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
        II = tryMergeAdjacentSTG(II, this, RS);
}

/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
    const MachineFunction &MF, int FI, Register &FrameReg,
    bool IgnoreSPUpdates) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (IgnoreSPUpdates) {
    LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
                      << MFI.getObjectOffset(FI) << "\n");
    FrameReg = AArch64::SP;
    return StackOffset::getFixed(MFI.getObjectOffset(FI));
  }

  return getFrameIndexReference(MF, FI, FrameReg);
}

/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
/// the parent's frame pointer.
unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
    const MachineFunction &MF) const {
  return 0;
}

/// Funclets only need to account for space for the callee saved registers,
/// as the locals are accounted for in the parent's stack frame.
unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
    const MachineFunction &MF) const {
  // This is the size of the pushed CSRs.
  unsigned CSSize =
      MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
  // This is the amount of stack a funclet needs to allocate.
  return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
                 getStackAlign());
}

namespace {
struct FrameObject {
  bool IsValid = false;
  // Index of the object in MFI.
  int ObjectIndex = 0;
  // Group ID this object belongs to.
  int GroupIndex = -1;
  // This object should be placed first (closest to SP).
  bool ObjectFirst = false;
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
};
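// Illustrative example (hypothetical): if %stack.0 and %stack.1 are always
// tagged by back-to-back STG instructions, they land in one group, and the
// sort below keeps them adjacent so a single ST2G or one STGloop can cover
// both slots.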
class GroupBuilder {
  SmallVector<int, 8> CurrentMembers;
  int NextGroupIndex = 0;
  std::vector<FrameObject> &Objects;

public:
  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
  void AddMember(int Index) { CurrentMembers.push_back(Index); }
  void EndCurrentGroup() {
    if (CurrentMembers.size() > 1) {
      // Create a new group with the current member list. This might remove
      // them from their pre-existing groups. That's OK, dealing with
      // overlapping groups is too hard and unlikely to make a difference.
      LLVM_DEBUG(dbgs() << "group:");
      for (int Index : CurrentMembers) {
        Objects[Index].GroupIndex = NextGroupIndex;
        LLVM_DEBUG(dbgs() << " " << Index);
      }
      LLVM_DEBUG(dbgs() << "\n");
      NextGroupIndex++;
    }
    CurrentMembers.clear();
  }
};

bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  // Objects at a lower index are closer to FP; objects at a higher index are
  // closer to SP.
  //
  // For consistency in our comparison, all invalid objects are placed
  // at the end. This also allows us to stop walking when we hit the
  // first invalid item after it's all sorted.
  //
  // The "first" object goes first (closest to SP), followed by the members of
  // the "first" group.
  //
  // The rest are sorted by the group index to keep the groups together.
  // Higher numbered groups are more likely to be around longer (i.e. untagged
  // in the function epilogue and not at some earlier point). Place them
  // closer to SP.
  //
  // If all else is equal, sort by the object index to keep the objects in
  // the original order.
  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
                         A.ObjectIndex) <
         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
                         B.ObjectIndex);
}
} // namespace

void AArch64FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  if (!OrderFrameObjects || ObjectsToAllocate.empty())
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  for (auto &Obj : ObjectsToAllocate) {
    FrameObjects[Obj].IsValid = true;
    FrameObjects[Obj].ObjectIndex = Obj;
  }

  // Identify stack slots that are tagged at the same time.
  GroupBuilder GB(FrameObjects);
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      int OpIndex;
      switch (MI.getOpcode()) {
      case AArch64::STGloop:
      case AArch64::STZGloop:
        OpIndex = 3;
        break;
      case AArch64::STGOffset:
      case AArch64::STZGOffset:
      case AArch64::ST2GOffset:
      case AArch64::STZ2GOffset:
        OpIndex = 1;
        break;
      default:
        OpIndex = -1;
      }

      int TaggedFI = -1;
      if (OpIndex >= 0) {
        const MachineOperand &MO = MI.getOperand(OpIndex);
        if (MO.isFI()) {
          int FI = MO.getIndex();
          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
              FrameObjects[FI].IsValid)
            TaggedFI = FI;
        }
      }

      // If this is a stack tagging instruction for a slot that is not part
      // of a group yet, either start a new group or add it to the current
      // one.
      if (TaggedFI >= 0)
        GB.AddMember(TaggedFI);
      else
        GB.EndCurrentGroup();
    }
    // Groups should never span multiple basic blocks.
    GB.EndCurrentGroup();
  }

  // If the function's tagged base pointer is pinned to a stack slot, we want
  // to put that slot first when possible. This will likely place it at SP + 0,
  // and save one instruction when generating the base pointer because IRG does
  // not allow an immediate offset.
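  // For example (illustrative): with the pinned slot at SP + 0 the tagged
  // base pointer is materialized as a single
  //   irg x0, sp
  // whereas any other placement would need an extra ADD before the IRG.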
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  if (TBPI) {
    FrameObjects[*TBPI].ObjectFirst = true;
    FrameObjects[*TBPI].GroupFirst = true;
    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
    if (FirstGroupIndex >= 0)
      for (FrameObject &Object : FrameObjects)
        if (Object.GroupIndex == FirstGroupIndex)
          Object.GroupFirst = true;
  }

  llvm::stable_sort(FrameObjects, FrameObjectCompare);

  int i = 0;
  for (auto &Obj : FrameObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  LLVM_DEBUG(dbgs() << "Final frame order:\n";
             for (auto &Obj : FrameObjects) {
               if (!Obj.IsValid)
                 break;
               dbgs() << "  " << Obj.ObjectIndex << ": group "
                      << Obj.GroupIndex;
               if (Obj.ObjectFirst)
                 dbgs() << ", first";
               if (Obj.GroupFirst)
                 dbgs() << ", group-first";
               dbgs() << "\n";
             });
}