1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64MachineFunctionInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "Utils/AArch64BaseInfo.h" 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFrameInfo.h" 23 #include "llvm/CodeGen/MachineFunction.h" 24 #include "llvm/CodeGen/MachineInstr.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineMemOperand.h" 27 #include "llvm/CodeGen/MachineModuleInfo.h" 28 #include "llvm/CodeGen/MachineOperand.h" 29 #include "llvm/CodeGen/MachineRegisterInfo.h" 30 #include "llvm/CodeGen/StackMaps.h" 31 #include "llvm/CodeGen/TargetRegisterInfo.h" 32 #include "llvm/CodeGen/TargetSubtargetInfo.h" 33 #include "llvm/IR/DebugInfoMetadata.h" 34 #include "llvm/IR/DebugLoc.h" 35 #include "llvm/IR/GlobalValue.h" 36 #include "llvm/MC/MCAsmInfo.h" 37 #include "llvm/MC/MCInst.h" 38 #include "llvm/MC/MCInstBuilder.h" 39 #include "llvm/MC/MCInstrDesc.h" 40 #include "llvm/Support/Casting.h" 41 #include "llvm/Support/CodeGen.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/Compiler.h" 44 #include "llvm/Support/ErrorHandling.h" 45 #include "llvm/Support/MathExtras.h" 46 #include "llvm/Target/TargetMachine.h" 47 #include "llvm/Target/TargetOptions.h" 48 
#include <cassert> 49 #include <cstdint> 50 #include <iterator> 51 #include <utility> 52 53 using namespace llvm; 54 55 #define GET_INSTRINFO_CTOR_DTOR 56 #include "AArch64GenInstrInfo.inc" 57 58 static cl::opt<unsigned> TBZDisplacementBits( 59 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 60 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 61 62 static cl::opt<unsigned> CBZDisplacementBits( 63 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 64 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 65 66 static cl::opt<unsigned> 67 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 68 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 69 70 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 71 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 72 AArch64::CATCHRET), 73 RI(STI.getTargetTriple()), Subtarget(STI) {} 74 75 /// GetInstSize - Return the number of bytes of code the specified 76 /// instruction may be. This returns the maximum number of bytes. 77 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 78 const MachineBasicBlock &MBB = *MI.getParent(); 79 const MachineFunction *MF = MBB.getParent(); 80 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 81 82 { 83 auto Op = MI.getOpcode(); 84 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 85 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 86 } 87 88 // Meta-instructions emit no code. 89 if (MI.isMetaInstruction()) 90 return 0; 91 92 // FIXME: We currently only handle pseudoinstructions that don't get expanded 93 // before the assembly printer. 94 unsigned NumBytes = 0; 95 const MCInstrDesc &Desc = MI.getDesc(); 96 switch (Desc.getOpcode()) { 97 default: 98 // Anything not explicitly designated otherwise is a normal 4-byte insn. 
99 NumBytes = 4; 100 break; 101 case TargetOpcode::STACKMAP: 102 // The upper bound for a stackmap intrinsic is the full length of its shadow 103 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 104 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 105 break; 106 case TargetOpcode::PATCHPOINT: 107 // The size of the patchpoint intrinsic is the number of bytes requested 108 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 109 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 110 break; 111 case TargetOpcode::STATEPOINT: 112 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 113 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 114 // No patch bytes means a normal call inst is emitted 115 if (NumBytes == 0) 116 NumBytes = 4; 117 break; 118 case AArch64::TLSDESC_CALLSEQ: 119 // This gets lowered to an instruction sequence which takes 16 bytes 120 NumBytes = 16; 121 break; 122 case AArch64::SpeculationBarrierISBDSBEndBB: 123 // This gets lowered to 2 4-byte instructions. 124 NumBytes = 8; 125 break; 126 case AArch64::SpeculationBarrierSBEndBB: 127 // This gets lowered to 1 4-byte instructions. 
128 NumBytes = 4; 129 break; 130 case AArch64::JumpTableDest32: 131 case AArch64::JumpTableDest16: 132 case AArch64::JumpTableDest8: 133 NumBytes = 12; 134 break; 135 case AArch64::SPACE: 136 NumBytes = MI.getOperand(1).getImm(); 137 break; 138 case AArch64::StoreSwiftAsyncContext: 139 NumBytes = 20; 140 break; 141 case TargetOpcode::BUNDLE: 142 NumBytes = getInstBundleLength(MI); 143 break; 144 } 145 146 return NumBytes; 147 } 148 149 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 150 unsigned Size = 0; 151 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 152 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 153 while (++I != E && I->isInsideBundle()) { 154 assert(!I->isBundle() && "No nested bundle!"); 155 Size += getInstSizeInBytes(*I); 156 } 157 return Size; 158 } 159 160 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 161 SmallVectorImpl<MachineOperand> &Cond) { 162 // Block ends with fall-through condbranch. 
163 switch (LastInst->getOpcode()) { 164 default: 165 llvm_unreachable("Unknown branch instruction?"); 166 case AArch64::Bcc: 167 Target = LastInst->getOperand(1).getMBB(); 168 Cond.push_back(LastInst->getOperand(0)); 169 break; 170 case AArch64::CBZW: 171 case AArch64::CBZX: 172 case AArch64::CBNZW: 173 case AArch64::CBNZX: 174 Target = LastInst->getOperand(1).getMBB(); 175 Cond.push_back(MachineOperand::CreateImm(-1)); 176 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 177 Cond.push_back(LastInst->getOperand(0)); 178 break; 179 case AArch64::TBZW: 180 case AArch64::TBZX: 181 case AArch64::TBNZW: 182 case AArch64::TBNZX: 183 Target = LastInst->getOperand(2).getMBB(); 184 Cond.push_back(MachineOperand::CreateImm(-1)); 185 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 186 Cond.push_back(LastInst->getOperand(0)); 187 Cond.push_back(LastInst->getOperand(1)); 188 } 189 } 190 191 static unsigned getBranchDisplacementBits(unsigned Opc) { 192 switch (Opc) { 193 default: 194 llvm_unreachable("unexpected opcode!"); 195 case AArch64::B: 196 return 64; 197 case AArch64::TBNZW: 198 case AArch64::TBZW: 199 case AArch64::TBNZX: 200 case AArch64::TBZX: 201 return TBZDisplacementBits; 202 case AArch64::CBNZW: 203 case AArch64::CBZW: 204 case AArch64::CBNZX: 205 case AArch64::CBZX: 206 return CBZDisplacementBits; 207 case AArch64::Bcc: 208 return BCCDisplacementBits; 209 } 210 } 211 212 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 213 int64_t BrOffset) const { 214 unsigned Bits = getBranchDisplacementBits(BranchOp); 215 assert(Bits >= 3 && "max branch displacement must be enough to jump" 216 "over conditional branch expansion"); 217 return isIntN(Bits, BrOffset / 4); 218 } 219 220 MachineBasicBlock * 221 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 222 switch (MI.getOpcode()) { 223 default: 224 llvm_unreachable("unexpected opcode!"); 225 case AArch64::B: 226 return MI.getOperand(0).getMBB(); 227 case 
AArch64::TBZW: 228 case AArch64::TBNZW: 229 case AArch64::TBZX: 230 case AArch64::TBNZX: 231 return MI.getOperand(2).getMBB(); 232 case AArch64::CBZW: 233 case AArch64::CBNZW: 234 case AArch64::CBZX: 235 case AArch64::CBNZX: 236 case AArch64::Bcc: 237 return MI.getOperand(1).getMBB(); 238 } 239 } 240 241 // Branch analysis. 242 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 243 MachineBasicBlock *&TBB, 244 MachineBasicBlock *&FBB, 245 SmallVectorImpl<MachineOperand> &Cond, 246 bool AllowModify) const { 247 // If the block has no terminators, it just falls into the block after it. 248 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 249 if (I == MBB.end()) 250 return false; 251 252 // Skip over SpeculationBarrierEndBB terminators 253 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 254 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 255 --I; 256 } 257 258 if (!isUnpredicatedTerminator(*I)) 259 return false; 260 261 // Get the last instruction in the block. 262 MachineInstr *LastInst = &*I; 263 264 // If there is only one terminator instruction, process it. 265 unsigned LastOpc = LastInst->getOpcode(); 266 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 267 if (isUncondBranchOpcode(LastOpc)) { 268 TBB = LastInst->getOperand(0).getMBB(); 269 return false; 270 } 271 if (isCondBranchOpcode(LastOpc)) { 272 // Block ends with fall-through condbranch. 273 parseCondBranch(LastInst, TBB, Cond); 274 return false; 275 } 276 return true; // Can't handle indirect branch. 277 } 278 279 // Get the instruction before it if it is a terminator. 280 MachineInstr *SecondLastInst = &*I; 281 unsigned SecondLastOpc = SecondLastInst->getOpcode(); 282 283 // If AllowModify is true and the block ends with two or more unconditional 284 // branches, delete all but the first unconditional branch. 
285 if (AllowModify && isUncondBranchOpcode(LastOpc)) { 286 while (isUncondBranchOpcode(SecondLastOpc)) { 287 LastInst->eraseFromParent(); 288 LastInst = SecondLastInst; 289 LastOpc = LastInst->getOpcode(); 290 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 291 // Return now the only terminator is an unconditional branch. 292 TBB = LastInst->getOperand(0).getMBB(); 293 return false; 294 } else { 295 SecondLastInst = &*I; 296 SecondLastOpc = SecondLastInst->getOpcode(); 297 } 298 } 299 } 300 301 // If we're allowed to modify and the block ends in a unconditional branch 302 // which could simply fallthrough, remove the branch. (Note: This case only 303 // matters when we can't understand the whole sequence, otherwise it's also 304 // handled by BranchFolding.cpp.) 305 if (AllowModify && isUncondBranchOpcode(LastOpc) && 306 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) { 307 LastInst->eraseFromParent(); 308 LastInst = SecondLastInst; 309 LastOpc = LastInst->getOpcode(); 310 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 311 assert(!isUncondBranchOpcode(LastOpc) && 312 "unreachable unconditional branches removed above"); 313 314 if (isCondBranchOpcode(LastOpc)) { 315 // Block ends with fall-through condbranch. 316 parseCondBranch(LastInst, TBB, Cond); 317 return false; 318 } 319 return true; // Can't handle indirect branch. 320 } else { 321 SecondLastInst = &*I; 322 SecondLastOpc = SecondLastInst->getOpcode(); 323 } 324 } 325 326 // If there are three terminators, we don't know what sort of block this is. 327 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) 328 return true; 329 330 // If the block ends with a B and a Bcc, handle it. 331 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 332 parseCondBranch(SecondLastInst, TBB, Cond); 333 FBB = LastInst->getOperand(0).getMBB(); 334 return false; 335 } 336 337 // If the block ends with two unconditional branches, handle it. 
The second 338 // one is not executed, so remove it. 339 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 340 TBB = SecondLastInst->getOperand(0).getMBB(); 341 I = LastInst; 342 if (AllowModify) 343 I->eraseFromParent(); 344 return false; 345 } 346 347 // ...likewise if it ends with an indirect branch followed by an unconditional 348 // branch. 349 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 350 I = LastInst; 351 if (AllowModify) 352 I->eraseFromParent(); 353 return true; 354 } 355 356 // Otherwise, can't handle this. 357 return true; 358 } 359 360 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, 361 MachineBranchPredicate &MBP, 362 bool AllowModify) const { 363 // For the moment, handle only a block which ends with a cb(n)zx followed by 364 // a fallthrough. Why this? Because it is a common form. 365 // TODO: Should we handle b.cc? 366 367 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 368 if (I == MBB.end()) 369 return true; 370 371 // Skip over SpeculationBarrierEndBB terminators 372 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 373 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 374 --I; 375 } 376 377 if (!isUnpredicatedTerminator(*I)) 378 return true; 379 380 // Get the last instruction in the block. 381 MachineInstr *LastInst = &*I; 382 unsigned LastOpc = LastInst->getOpcode(); 383 if (!isCondBranchOpcode(LastOpc)) 384 return true; 385 386 switch (LastOpc) { 387 default: 388 return true; 389 case AArch64::CBZW: 390 case AArch64::CBZX: 391 case AArch64::CBNZW: 392 case AArch64::CBNZX: 393 break; 394 }; 395 396 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 397 assert(MBP.TrueDest && "expected!"); 398 MBP.FalseDest = MBB.getNextNode(); 399 400 MBP.ConditionDef = nullptr; 401 MBP.SingleUseCondition = false; 402 403 MBP.LHS = LastInst->getOperand(0); 404 MBP.RHS = MachineOperand::CreateImm(0); 405 MBP.Predicate = LastOpc == AArch64::CBNZX ? 
MachineBranchPredicate::PRED_NE 406 : MachineBranchPredicate::PRED_EQ; 407 return false; 408 } 409 410 bool AArch64InstrInfo::reverseBranchCondition( 411 SmallVectorImpl<MachineOperand> &Cond) const { 412 if (Cond[0].getImm() != -1) { 413 // Regular Bcc 414 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 415 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 416 } else { 417 // Folded compare-and-branch 418 switch (Cond[1].getImm()) { 419 default: 420 llvm_unreachable("Unknown conditional branch!"); 421 case AArch64::CBZW: 422 Cond[1].setImm(AArch64::CBNZW); 423 break; 424 case AArch64::CBNZW: 425 Cond[1].setImm(AArch64::CBZW); 426 break; 427 case AArch64::CBZX: 428 Cond[1].setImm(AArch64::CBNZX); 429 break; 430 case AArch64::CBNZX: 431 Cond[1].setImm(AArch64::CBZX); 432 break; 433 case AArch64::TBZW: 434 Cond[1].setImm(AArch64::TBNZW); 435 break; 436 case AArch64::TBNZW: 437 Cond[1].setImm(AArch64::TBZW); 438 break; 439 case AArch64::TBZX: 440 Cond[1].setImm(AArch64::TBNZX); 441 break; 442 case AArch64::TBNZX: 443 Cond[1].setImm(AArch64::TBZX); 444 break; 445 } 446 } 447 448 return false; 449 } 450 451 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 452 int *BytesRemoved) const { 453 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 454 if (I == MBB.end()) 455 return 0; 456 457 if (!isUncondBranchOpcode(I->getOpcode()) && 458 !isCondBranchOpcode(I->getOpcode())) 459 return 0; 460 461 // Remove the branch. 462 I->eraseFromParent(); 463 464 I = MBB.end(); 465 466 if (I == MBB.begin()) { 467 if (BytesRemoved) 468 *BytesRemoved = 4; 469 return 1; 470 } 471 --I; 472 if (!isCondBranchOpcode(I->getOpcode())) { 473 if (BytesRemoved) 474 *BytesRemoved = 4; 475 return 1; 476 } 477 478 // Remove the branch. 
479 I->eraseFromParent(); 480 if (BytesRemoved) 481 *BytesRemoved = 8; 482 483 return 2; 484 } 485 486 void AArch64InstrInfo::instantiateCondBranch( 487 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 488 ArrayRef<MachineOperand> Cond) const { 489 if (Cond[0].getImm() != -1) { 490 // Regular Bcc 491 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 492 } else { 493 // Folded compare-and-branch 494 // Note that we use addOperand instead of addReg to keep the flags. 495 const MachineInstrBuilder MIB = 496 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 497 if (Cond.size() > 3) 498 MIB.addImm(Cond[3].getImm()); 499 MIB.addMBB(TBB); 500 } 501 } 502 503 unsigned AArch64InstrInfo::insertBranch( 504 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 505 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 506 // Shouldn't be a fall through. 507 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 508 509 if (!FBB) { 510 if (Cond.empty()) // Unconditional branch? 511 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 512 else 513 instantiateCondBranch(MBB, DL, TBB, Cond); 514 515 if (BytesAdded) 516 *BytesAdded = 4; 517 518 return 1; 519 } 520 521 // Two-way conditional branch. 522 instantiateCondBranch(MBB, DL, TBB, Cond); 523 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 524 525 if (BytesAdded) 526 *BytesAdded = 8; 527 528 return 2; 529 } 530 531 // Find the original register that VReg is copied from. 532 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 533 while (Register::isVirtualRegister(VReg)) { 534 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 535 if (!DefMI->isFullCopy()) 536 return VReg; 537 VReg = DefMI->getOperand(1).getReg(); 538 } 539 return VReg; 540 } 541 542 // Determine if VReg is defined by an instruction that can be folded into a 543 // csel instruction. 
If so, return the folded opcode, and the replacement 544 // register. 545 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 546 unsigned *NewVReg = nullptr) { 547 VReg = removeCopies(MRI, VReg); 548 if (!Register::isVirtualRegister(VReg)) 549 return 0; 550 551 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 552 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 553 unsigned Opc = 0; 554 unsigned SrcOpNum = 0; 555 switch (DefMI->getOpcode()) { 556 case AArch64::ADDSXri: 557 case AArch64::ADDSWri: 558 // if NZCV is used, do not fold. 559 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 560 return 0; 561 // fall-through to ADDXri and ADDWri. 562 LLVM_FALLTHROUGH; 563 case AArch64::ADDXri: 564 case AArch64::ADDWri: 565 // add x, 1 -> csinc. 566 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 567 DefMI->getOperand(3).getImm() != 0) 568 return 0; 569 SrcOpNum = 1; 570 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 571 break; 572 573 case AArch64::ORNXrr: 574 case AArch64::ORNWrr: { 575 // not x -> csinv, represented as orn dst, xzr, src. 576 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 577 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 578 return 0; 579 SrcOpNum = 2; 580 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 581 break; 582 } 583 584 case AArch64::SUBSXrr: 585 case AArch64::SUBSWrr: 586 // if NZCV is used, do not fold. 587 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 588 return 0; 589 // fall-through to SUBXrr and SUBWrr. 590 LLVM_FALLTHROUGH; 591 case AArch64::SUBXrr: 592 case AArch64::SUBWrr: { 593 // neg x -> csneg, represented as sub dst, xzr, src. 594 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 595 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 596 return 0; 597 SrcOpNum = 2; 598 Opc = Is64Bit ? 
AArch64::CSNEGXr : AArch64::CSNEGWr; 599 break; 600 } 601 default: 602 return 0; 603 } 604 assert(Opc && SrcOpNum && "Missing parameters"); 605 606 if (NewVReg) 607 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 608 return Opc; 609 } 610 611 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 612 ArrayRef<MachineOperand> Cond, 613 Register DstReg, Register TrueReg, 614 Register FalseReg, int &CondCycles, 615 int &TrueCycles, 616 int &FalseCycles) const { 617 // Check register classes. 618 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 619 const TargetRegisterClass *RC = 620 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 621 if (!RC) 622 return false; 623 624 // Also need to check the dest regclass, in case we're trying to optimize 625 // something like: 626 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 627 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 628 return false; 629 630 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 631 unsigned ExtraCondLat = Cond.size() != 1; 632 633 // GPRs are handled by csel. 634 // FIXME: Fold in x+1, -x, and ~x when applicable. 635 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 636 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 637 // Single-cycle csel, csinc, csinv, and csneg. 638 CondCycles = 1 + ExtraCondLat; 639 TrueCycles = FalseCycles = 1; 640 if (canFoldIntoCSel(MRI, TrueReg)) 641 TrueCycles = 0; 642 else if (canFoldIntoCSel(MRI, FalseReg)) 643 FalseCycles = 0; 644 return true; 645 } 646 647 // Scalar floating point is handled by fcsel. 648 // FIXME: Form fabs, fmin, and fmax when applicable. 649 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 650 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 651 CondCycles = 5 + ExtraCondLat; 652 TrueCycles = FalseCycles = 2; 653 return true; 654 } 655 656 // Can't do vectors. 
657 return false; 658 } 659 660 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 661 MachineBasicBlock::iterator I, 662 const DebugLoc &DL, Register DstReg, 663 ArrayRef<MachineOperand> Cond, 664 Register TrueReg, Register FalseReg) const { 665 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 666 667 // Parse the condition code, see parseCondBranch() above. 668 AArch64CC::CondCode CC; 669 switch (Cond.size()) { 670 default: 671 llvm_unreachable("Unknown condition opcode in Cond"); 672 case 1: // b.cc 673 CC = AArch64CC::CondCode(Cond[0].getImm()); 674 break; 675 case 3: { // cbz/cbnz 676 // We must insert a compare against 0. 677 bool Is64Bit; 678 switch (Cond[1].getImm()) { 679 default: 680 llvm_unreachable("Unknown branch opcode in Cond"); 681 case AArch64::CBZW: 682 Is64Bit = false; 683 CC = AArch64CC::EQ; 684 break; 685 case AArch64::CBZX: 686 Is64Bit = true; 687 CC = AArch64CC::EQ; 688 break; 689 case AArch64::CBNZW: 690 Is64Bit = false; 691 CC = AArch64CC::NE; 692 break; 693 case AArch64::CBNZX: 694 Is64Bit = true; 695 CC = AArch64CC::NE; 696 break; 697 } 698 Register SrcReg = Cond[2].getReg(); 699 if (Is64Bit) { 700 // cmp reg, #0 is actually subs xzr, reg, #0. 701 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 702 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 703 .addReg(SrcReg) 704 .addImm(0) 705 .addImm(0); 706 } else { 707 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 708 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 709 .addReg(SrcReg) 710 .addImm(0) 711 .addImm(0); 712 } 713 break; 714 } 715 case 4: { // tbz/tbnz 716 // We must insert a tst instruction. 717 switch (Cond[1].getImm()) { 718 default: 719 llvm_unreachable("Unknown branch opcode in Cond"); 720 case AArch64::TBZW: 721 case AArch64::TBZX: 722 CC = AArch64CC::EQ; 723 break; 724 case AArch64::TBNZW: 725 case AArch64::TBNZX: 726 CC = AArch64CC::NE; 727 break; 728 } 729 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
730 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW) 731 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR) 732 .addReg(Cond[2].getReg()) 733 .addImm( 734 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32)); 735 else 736 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR) 737 .addReg(Cond[2].getReg()) 738 .addImm( 739 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64)); 740 break; 741 } 742 } 743 744 unsigned Opc = 0; 745 const TargetRegisterClass *RC = nullptr; 746 bool TryFold = false; 747 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) { 748 RC = &AArch64::GPR64RegClass; 749 Opc = AArch64::CSELXr; 750 TryFold = true; 751 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) { 752 RC = &AArch64::GPR32RegClass; 753 Opc = AArch64::CSELWr; 754 TryFold = true; 755 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) { 756 RC = &AArch64::FPR64RegClass; 757 Opc = AArch64::FCSELDrrr; 758 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) { 759 RC = &AArch64::FPR32RegClass; 760 Opc = AArch64::FCSELSrrr; 761 } 762 assert(RC && "Unsupported regclass"); 763 764 // Try folding simple instructions into the csel. 765 if (TryFold) { 766 unsigned NewVReg = 0; 767 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg); 768 if (FoldedOpc) { 769 // The folded opcodes csinc, csinc and csneg apply the operation to 770 // FalseReg, so we need to invert the condition. 771 CC = AArch64CC::getInvertedCondCode(CC); 772 TrueReg = FalseReg; 773 } else 774 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg); 775 776 // Fold the operation. Leave any dead instructions for DCE to clean up. 777 if (FoldedOpc) { 778 FalseReg = NewVReg; 779 Opc = FoldedOpc; 780 // The extends the live range of NewVReg. 781 MRI.clearKillFlags(NewVReg); 782 } 783 } 784 785 // Pull all virtual register into the appropriate class. 
786 MRI.constrainRegClass(TrueReg, RC); 787 MRI.constrainRegClass(FalseReg, RC); 788 789 // Insert the csel. 790 BuildMI(MBB, I, DL, get(Opc), DstReg) 791 .addReg(TrueReg) 792 .addReg(FalseReg) 793 .addImm(CC); 794 } 795 796 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 797 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { 798 uint64_t Imm = MI.getOperand(1).getImm(); 799 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); 800 uint64_t Encoding; 801 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); 802 } 803 804 // FIXME: this implementation should be micro-architecture dependent, so a 805 // micro-architecture target hook should be introduced here in future. 806 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { 807 if (!Subtarget.hasCustomCheapAsMoveHandling()) 808 return MI.isAsCheapAsAMove(); 809 810 const unsigned Opcode = MI.getOpcode(); 811 812 // Firstly, check cases gated by features. 813 814 if (Subtarget.hasZeroCycleZeroingFP()) { 815 if (Opcode == AArch64::FMOVH0 || 816 Opcode == AArch64::FMOVS0 || 817 Opcode == AArch64::FMOVD0) 818 return true; 819 } 820 821 if (Subtarget.hasZeroCycleZeroingGP()) { 822 if (Opcode == TargetOpcode::COPY && 823 (MI.getOperand(1).getReg() == AArch64::WZR || 824 MI.getOperand(1).getReg() == AArch64::XZR)) 825 return true; 826 } 827 828 // Secondly, check cases specific to sub-targets. 829 830 if (Subtarget.hasExynosCheapAsMoveHandling()) { 831 if (isExynosCheapAsMove(MI)) 832 return true; 833 834 return MI.isAsCheapAsAMove(); 835 } 836 837 // Finally, check generic cases. 
838 839 switch (Opcode) { 840 default: 841 return false; 842 843 // add/sub on register without shift 844 case AArch64::ADDWri: 845 case AArch64::ADDXri: 846 case AArch64::SUBWri: 847 case AArch64::SUBXri: 848 return (MI.getOperand(3).getImm() == 0); 849 850 // logical ops on immediate 851 case AArch64::ANDWri: 852 case AArch64::ANDXri: 853 case AArch64::EORWri: 854 case AArch64::EORXri: 855 case AArch64::ORRWri: 856 case AArch64::ORRXri: 857 return true; 858 859 // logical ops on register without shift 860 case AArch64::ANDWrr: 861 case AArch64::ANDXrr: 862 case AArch64::BICWrr: 863 case AArch64::BICXrr: 864 case AArch64::EONWrr: 865 case AArch64::EONXrr: 866 case AArch64::EORWrr: 867 case AArch64::EORXrr: 868 case AArch64::ORNWrr: 869 case AArch64::ORNXrr: 870 case AArch64::ORRWrr: 871 case AArch64::ORRXrr: 872 return true; 873 874 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 875 // ORRXri, it is as cheap as MOV 876 case AArch64::MOVi32imm: 877 return canBeExpandedToORR(MI, 32); 878 case AArch64::MOVi64imm: 879 return canBeExpandedToORR(MI, 64); 880 } 881 882 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 883 } 884 885 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 886 switch (MI.getOpcode()) { 887 default: 888 return false; 889 890 case AArch64::ADDWrs: 891 case AArch64::ADDXrs: 892 case AArch64::ADDSWrs: 893 case AArch64::ADDSXrs: { 894 unsigned Imm = MI.getOperand(3).getImm(); 895 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 896 if (ShiftVal == 0) 897 return true; 898 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 899 } 900 901 case AArch64::ADDWrx: 902 case AArch64::ADDXrx: 903 case AArch64::ADDXrx64: 904 case AArch64::ADDSWrx: 905 case AArch64::ADDSXrx: 906 case AArch64::ADDSXrx64: { 907 unsigned Imm = MI.getOperand(3).getImm(); 908 switch (AArch64_AM::getArithExtendType(Imm)) { 909 default: 910 return false; 911 case AArch64_AM::UXTB: 912 case AArch64_AM::UXTH: 913 
case AArch64_AM::UXTW: 914 case AArch64_AM::UXTX: 915 return AArch64_AM::getArithShiftValue(Imm) <= 4; 916 } 917 } 918 919 case AArch64::SUBWrs: 920 case AArch64::SUBSWrs: { 921 unsigned Imm = MI.getOperand(3).getImm(); 922 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 923 return ShiftVal == 0 || 924 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 925 } 926 927 case AArch64::SUBXrs: 928 case AArch64::SUBSXrs: { 929 unsigned Imm = MI.getOperand(3).getImm(); 930 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 931 return ShiftVal == 0 || 932 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 933 } 934 935 case AArch64::SUBWrx: 936 case AArch64::SUBXrx: 937 case AArch64::SUBXrx64: 938 case AArch64::SUBSWrx: 939 case AArch64::SUBSXrx: 940 case AArch64::SUBSXrx64: { 941 unsigned Imm = MI.getOperand(3).getImm(); 942 switch (AArch64_AM::getArithExtendType(Imm)) { 943 default: 944 return false; 945 case AArch64_AM::UXTB: 946 case AArch64_AM::UXTH: 947 case AArch64_AM::UXTW: 948 case AArch64_AM::UXTX: 949 return AArch64_AM::getArithShiftValue(Imm) == 0; 950 } 951 } 952 953 case AArch64::LDRBBroW: 954 case AArch64::LDRBBroX: 955 case AArch64::LDRBroW: 956 case AArch64::LDRBroX: 957 case AArch64::LDRDroW: 958 case AArch64::LDRDroX: 959 case AArch64::LDRHHroW: 960 case AArch64::LDRHHroX: 961 case AArch64::LDRHroW: 962 case AArch64::LDRHroX: 963 case AArch64::LDRQroW: 964 case AArch64::LDRQroX: 965 case AArch64::LDRSBWroW: 966 case AArch64::LDRSBWroX: 967 case AArch64::LDRSBXroW: 968 case AArch64::LDRSBXroX: 969 case AArch64::LDRSHWroW: 970 case AArch64::LDRSHWroX: 971 case AArch64::LDRSHXroW: 972 case AArch64::LDRSHXroX: 973 case AArch64::LDRSWroW: 974 case AArch64::LDRSWroX: 975 case AArch64::LDRSroW: 976 case AArch64::LDRSroX: 977 case AArch64::LDRWroW: 978 case AArch64::LDRWroX: 979 case AArch64::LDRXroW: 980 case AArch64::LDRXroX: 981 case AArch64::PRFMroW: 982 case AArch64::PRFMroX: 983 case AArch64::STRBBroW: 984 case 
AArch64::STRBBroX: 985 case AArch64::STRBroW: 986 case AArch64::STRBroX: 987 case AArch64::STRDroW: 988 case AArch64::STRDroX: 989 case AArch64::STRHHroW: 990 case AArch64::STRHHroX: 991 case AArch64::STRHroW: 992 case AArch64::STRHroX: 993 case AArch64::STRQroW: 994 case AArch64::STRQroX: 995 case AArch64::STRSroW: 996 case AArch64::STRSroX: 997 case AArch64::STRWroW: 998 case AArch64::STRWroX: 999 case AArch64::STRXroW: 1000 case AArch64::STRXroX: { 1001 unsigned IsSigned = MI.getOperand(3).getImm(); 1002 return !IsSigned; 1003 } 1004 } 1005 } 1006 1007 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 1008 unsigned Opc = MI.getOpcode(); 1009 switch (Opc) { 1010 default: 1011 return false; 1012 case AArch64::SEH_StackAlloc: 1013 case AArch64::SEH_SaveFPLR: 1014 case AArch64::SEH_SaveFPLR_X: 1015 case AArch64::SEH_SaveReg: 1016 case AArch64::SEH_SaveReg_X: 1017 case AArch64::SEH_SaveRegP: 1018 case AArch64::SEH_SaveRegP_X: 1019 case AArch64::SEH_SaveFReg: 1020 case AArch64::SEH_SaveFReg_X: 1021 case AArch64::SEH_SaveFRegP: 1022 case AArch64::SEH_SaveFRegP_X: 1023 case AArch64::SEH_SetFP: 1024 case AArch64::SEH_AddFP: 1025 case AArch64::SEH_Nop: 1026 case AArch64::SEH_PrologEnd: 1027 case AArch64::SEH_EpilogStart: 1028 case AArch64::SEH_EpilogEnd: 1029 return true; 1030 } 1031 } 1032 1033 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1034 Register &SrcReg, Register &DstReg, 1035 unsigned &SubIdx) const { 1036 switch (MI.getOpcode()) { 1037 default: 1038 return false; 1039 case AArch64::SBFMXri: // aka sxtw 1040 case AArch64::UBFMXri: // aka uxtw 1041 // Check for the 32 -> 64 bit extension case, these instructions can do 1042 // much more. 1043 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1044 return false; 1045 // This is a signed or unsigned 32 -> 64 bit extension. 
1046 SrcReg = MI.getOperand(1).getReg(); 1047 DstReg = MI.getOperand(0).getReg(); 1048 SubIdx = AArch64::sub_32; 1049 return true; 1050 } 1051 } 1052 1053 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 1054 const MachineInstr &MIa, const MachineInstr &MIb) const { 1055 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1056 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 1057 int64_t OffsetA = 0, OffsetB = 0; 1058 unsigned WidthA = 0, WidthB = 0; 1059 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 1060 1061 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 1062 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 1063 1064 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 1065 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1066 return false; 1067 1068 // Retrieve the base, offset from the base and width. Width 1069 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 1070 // base are identical, and the offset of a lower memory access + 1071 // the width doesn't overlap the offset of a higher memory access, 1072 // then the memory accesses are different. 1073 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 1074 // are assumed to have the same scale (vscale). 1075 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 1076 WidthA, TRI) && 1077 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 1078 WidthB, TRI)) { 1079 if (BaseOpA->isIdenticalTo(*BaseOpB) && 1080 OffsetAIsScalable == OffsetBIsScalable) { 1081 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1082 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1083 int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 1084 if (LowOffset + LowWidth <= HighOffset) 1085 return true; 1086 } 1087 } 1088 return false; 1089 } 1090 1091 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1092 const MachineBasicBlock *MBB, 1093 const MachineFunction &MF) const { 1094 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 1095 return true; 1096 switch (MI.getOpcode()) { 1097 case AArch64::HINT: 1098 // CSDB hints are scheduling barriers. 1099 if (MI.getOperand(0).getImm() == 0x14) 1100 return true; 1101 break; 1102 case AArch64::DSB: 1103 case AArch64::ISB: 1104 // DSB and ISB also are scheduling barriers. 1105 return true; 1106 default:; 1107 } 1108 return isSEHInstruction(MI); 1109 } 1110 1111 /// analyzeCompare - For a comparison instruction, return the source registers 1112 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1113 /// Return true if the comparison instruction can be analyzed. 1114 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1115 Register &SrcReg2, int &CmpMask, 1116 int &CmpValue) const { 1117 // The first operand can be a frame index where we'd normally expect a 1118 // register. 1119 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1120 if (!MI.getOperand(1).isReg()) 1121 return false; 1122 1123 auto NormalizeCmpValue = [](int64_t Value) -> int { 1124 // Comparison immediates may be 64-bit, but CmpValue is only an int. 1125 // Normalize to 0/1/2 return value, where 2 indicates any value apart from 1126 // 0 or 1. 1127 // TODO: Switch CmpValue to int64_t in the API to avoid this. 1128 if (Value == 0 || Value == 1) 1129 return Value; 1130 return 2; 1131 }; 1132 1133 switch (MI.getOpcode()) { 1134 default: 1135 break; 1136 case AArch64::PTEST_PP: 1137 SrcReg = MI.getOperand(0).getReg(); 1138 SrcReg2 = MI.getOperand(1).getReg(); 1139 // Not sure about the mask and value for now... 
1140 CmpMask = ~0; 1141 CmpValue = 0; 1142 return true; 1143 case AArch64::SUBSWrr: 1144 case AArch64::SUBSWrs: 1145 case AArch64::SUBSWrx: 1146 case AArch64::SUBSXrr: 1147 case AArch64::SUBSXrs: 1148 case AArch64::SUBSXrx: 1149 case AArch64::ADDSWrr: 1150 case AArch64::ADDSWrs: 1151 case AArch64::ADDSWrx: 1152 case AArch64::ADDSXrr: 1153 case AArch64::ADDSXrs: 1154 case AArch64::ADDSXrx: 1155 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1156 SrcReg = MI.getOperand(1).getReg(); 1157 SrcReg2 = MI.getOperand(2).getReg(); 1158 CmpMask = ~0; 1159 CmpValue = 0; 1160 return true; 1161 case AArch64::SUBSWri: 1162 case AArch64::ADDSWri: 1163 case AArch64::SUBSXri: 1164 case AArch64::ADDSXri: 1165 SrcReg = MI.getOperand(1).getReg(); 1166 SrcReg2 = 0; 1167 CmpMask = ~0; 1168 CmpValue = NormalizeCmpValue(MI.getOperand(2).getImm()); 1169 return true; 1170 case AArch64::ANDSWri: 1171 case AArch64::ANDSXri: 1172 // ANDS does not use the same encoding scheme as the others xxxS 1173 // instructions. 1174 SrcReg = MI.getOperand(1).getReg(); 1175 SrcReg2 = 0; 1176 CmpMask = ~0; 1177 CmpValue = NormalizeCmpValue(AArch64_AM::decodeLogicalImmediate( 1178 MI.getOperand(2).getImm(), 1179 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64)); 1180 return true; 1181 } 1182 1183 return false; 1184 } 1185 1186 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1187 MachineBasicBlock *MBB = Instr.getParent(); 1188 assert(MBB && "Can't get MachineBasicBlock here"); 1189 MachineFunction *MF = MBB->getParent(); 1190 assert(MF && "Can't get MachineFunction here"); 1191 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1192 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1193 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1194 1195 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1196 ++OpIdx) { 1197 MachineOperand &MO = Instr.getOperand(OpIdx); 1198 const TargetRegisterClass *OpRegCstraints = 1199 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1200 1201 // If there's no constraint, there's nothing to do. 1202 if (!OpRegCstraints) 1203 continue; 1204 // If the operand is a frame index, there's nothing to do here. 1205 // A frame index operand will resolve correctly during PEI. 1206 if (MO.isFI()) 1207 continue; 1208 1209 assert(MO.isReg() && 1210 "Operand has register constraints without being a register!"); 1211 1212 Register Reg = MO.getReg(); 1213 if (Register::isPhysicalRegister(Reg)) { 1214 if (!OpRegCstraints->contains(Reg)) 1215 return false; 1216 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1217 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1218 return false; 1219 } 1220 1221 return true; 1222 } 1223 1224 /// Return the opcode that does not set flags when possible - otherwise 1225 /// return the original opcode. The caller is responsible to do the actual 1226 /// substitution and legality checking. 1227 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1228 // Don't convert all compare instructions, because for some the zero register 1229 // encoding becomes the sp register. 
1230 bool MIDefinesZeroReg = false; 1231 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1232 MIDefinesZeroReg = true; 1233 1234 switch (MI.getOpcode()) { 1235 default: 1236 return MI.getOpcode(); 1237 case AArch64::ADDSWrr: 1238 return AArch64::ADDWrr; 1239 case AArch64::ADDSWri: 1240 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1241 case AArch64::ADDSWrs: 1242 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1243 case AArch64::ADDSWrx: 1244 return AArch64::ADDWrx; 1245 case AArch64::ADDSXrr: 1246 return AArch64::ADDXrr; 1247 case AArch64::ADDSXri: 1248 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1249 case AArch64::ADDSXrs: 1250 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1251 case AArch64::ADDSXrx: 1252 return AArch64::ADDXrx; 1253 case AArch64::SUBSWrr: 1254 return AArch64::SUBWrr; 1255 case AArch64::SUBSWri: 1256 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1257 case AArch64::SUBSWrs: 1258 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1259 case AArch64::SUBSWrx: 1260 return AArch64::SUBWrx; 1261 case AArch64::SUBSXrr: 1262 return AArch64::SUBXrr; 1263 case AArch64::SUBSXri: 1264 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1265 case AArch64::SUBSXrs: 1266 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1267 case AArch64::SUBSXrx: 1268 return AArch64::SUBXrx; 1269 } 1270 } 1271 1272 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1273 1274 /// True when condition flags are accessed (either by writing or reading) 1275 /// on the instruction trace starting at From and ending at To. 1276 /// 1277 /// Note: If From and To are from different blocks it's assumed CC are accessed 1278 /// on the path. 
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  // Debug instructions are skipped. AccessToCheck selects whether writes,
  // reads, or both kinds of NZCV access count as a hit.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
///
/// \param PTest   the PTEST instruction to remove.
/// \param MaskReg virtual register holding the governing predicate (mask).
/// \param PredReg virtual register holding the predicate being tested.
/// \param MRI     used to look up the unique definitions of both registers.
/// \returns true if \p PTest was erased (possibly after switching the
/// instruction defining \p PredReg to its flag-setting opcode), false if
/// nothing was changed.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  // NOTE(review): both definitions are dereferenced without a null check, so
  // this relies on each register having a unique def — confirm with callers.
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
    // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
    // deactivate any lanes OTHER_INST might set.
    uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);

    // Must be an all active predicate of matching element size.
    // (31 is presumably the "all lanes" pattern immediate — confirm.)
    if ((PredElementSize != MaskElementSize) ||
        (Mask->getOperand(1).getImm() != 31))
      return false;

    // Fallthrough to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would.

    // Fallthrough to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
    // instructions use the same predicate.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PTestLikeMask)
      return false;

    // Fallthrough to simply remove the PTEST.
  } else {
    // Remaining candidates need their opcode switched to the flag-setting
    // variant; each one is only eligible when it is governed by the same mask.
    switch (Pred->getOpcode()) {
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP: {
      // Op 0 is chain, 1 is the mask, 2 the previous predicate to
      // propagate, 3 the new predicate.

      // Check to see if our mask is the same as the brkpb's. If
      // not the resulting flag bits may be different and we
      // can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      // Switch to the new opcode
      NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
                                                      : AArch64::BRKPBS_PPzPP;
      OpChanged = true;
      break;
    }
    case AArch64::BRKN_PPzP: {
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      NewOp = AArch64::BRKNS_PPzP;
      OpChanged = true;
      break;
    }
    case AArch64::RDFFR_PPz: {
      // rdffr   p1.b, PredMask=p0/z <--- Definition of Pred
      // ptest   Mask=p0, Pred=p1.b  <--- If equal masks, remove this and use
      //                                  `rdffrs p1.b, p0/z` above.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      NewOp = AArch64::RDFFRS_PPz;
      OpChanged = true;
      break;
    }
    default:
      // Bail out if we don't recognize the input
      return false;
    }
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    // The new opcode may carry different operand constraints and gains an
    // NZCV definition that the old one did not have.
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is a true compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
///
/// \returns true if \p CmpInstr was erased or rewritten.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
    int CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    // Flags are dead and the result goes to the zero register: the
    // instruction is simply erased.
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    // The non-flag-setting form no longer has the (dead) NZCV def operand.
    CmpInstr.RemoveOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  // Warning: CmpValue == 2 indicates *any* value apart from 0 or 1.
  assert((CmpValue == 0 || CmpValue == 1 || CmpValue == 2) &&
         "CmpValue must be 0, 1, or 2!");
  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  // Try substitution first (compare-with-zero only), then outright removal
  // (compare with zero or one).
  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
1489 static unsigned sForm(MachineInstr &Instr) { 1490 switch (Instr.getOpcode()) { 1491 default: 1492 return AArch64::INSTRUCTION_LIST_END; 1493 1494 case AArch64::ADDSWrr: 1495 case AArch64::ADDSWri: 1496 case AArch64::ADDSXrr: 1497 case AArch64::ADDSXri: 1498 case AArch64::SUBSWrr: 1499 case AArch64::SUBSWri: 1500 case AArch64::SUBSXrr: 1501 case AArch64::SUBSXri: 1502 return Instr.getOpcode(); 1503 1504 case AArch64::ADDWrr: 1505 return AArch64::ADDSWrr; 1506 case AArch64::ADDWri: 1507 return AArch64::ADDSWri; 1508 case AArch64::ADDXrr: 1509 return AArch64::ADDSXrr; 1510 case AArch64::ADDXri: 1511 return AArch64::ADDSXri; 1512 case AArch64::ADCWr: 1513 return AArch64::ADCSWr; 1514 case AArch64::ADCXr: 1515 return AArch64::ADCSXr; 1516 case AArch64::SUBWrr: 1517 return AArch64::SUBSWrr; 1518 case AArch64::SUBWri: 1519 return AArch64::SUBSWri; 1520 case AArch64::SUBXrr: 1521 return AArch64::SUBSXrr; 1522 case AArch64::SUBXri: 1523 return AArch64::SUBSXri; 1524 case AArch64::SBCWr: 1525 return AArch64::SBCSWr; 1526 case AArch64::SBCXr: 1527 return AArch64::SBCSXr; 1528 case AArch64::ANDWri: 1529 return AArch64::ANDSWri; 1530 case AArch64::ANDXri: 1531 return AArch64::ANDSXri; 1532 } 1533 } 1534 1535 /// Check if AArch64::NZCV should be alive in successors of MBB. 1536 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1537 for (auto *BB : MBB->successors()) 1538 if (BB->isLiveIn(AArch64::NZCV)) 1539 return true; 1540 return false; 1541 } 1542 1543 /// \returns The condition code operand index for \p Instr if it is a branch 1544 /// or select and -1 otherwise. 
1545 static int 1546 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { 1547 switch (Instr.getOpcode()) { 1548 default: 1549 return -1; 1550 1551 case AArch64::Bcc: { 1552 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1553 assert(Idx >= 2); 1554 return Idx - 2; 1555 } 1556 1557 case AArch64::CSINVWr: 1558 case AArch64::CSINVXr: 1559 case AArch64::CSINCWr: 1560 case AArch64::CSINCXr: 1561 case AArch64::CSELWr: 1562 case AArch64::CSELXr: 1563 case AArch64::CSNEGWr: 1564 case AArch64::CSNEGXr: 1565 case AArch64::FCSELSrrr: 1566 case AArch64::FCSELDrrr: { 1567 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1568 assert(Idx >= 1); 1569 return Idx - 1; 1570 } 1571 } 1572 } 1573 1574 namespace { 1575 1576 struct UsedNZCV { 1577 bool N = false; 1578 bool Z = false; 1579 bool C = false; 1580 bool V = false; 1581 1582 UsedNZCV() = default; 1583 1584 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { 1585 this->N |= UsedFlags.N; 1586 this->Z |= UsedFlags.Z; 1587 this->C |= UsedFlags.C; 1588 this->V |= UsedFlags.V; 1589 return *this; 1590 } 1591 }; 1592 1593 } // end anonymous namespace 1594 1595 /// Find a condition code used by the instruction. 1596 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1597 /// codes or we don't optimize CmpInstr in the presence of such instructions. 1598 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1599 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr); 1600 return CCIdx >= 0 ? 
static_cast<AArch64CC::CondCode>( 1601 Instr.getOperand(CCIdx).getImm()) 1602 : AArch64CC::Invalid; 1603 } 1604 1605 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1606 assert(CC != AArch64CC::Invalid); 1607 UsedNZCV UsedFlags; 1608 switch (CC) { 1609 default: 1610 break; 1611 1612 case AArch64CC::EQ: // Z set 1613 case AArch64CC::NE: // Z clear 1614 UsedFlags.Z = true; 1615 break; 1616 1617 case AArch64CC::HI: // Z clear and C set 1618 case AArch64CC::LS: // Z set or C clear 1619 UsedFlags.Z = true; 1620 LLVM_FALLTHROUGH; 1621 case AArch64CC::HS: // C set 1622 case AArch64CC::LO: // C clear 1623 UsedFlags.C = true; 1624 break; 1625 1626 case AArch64CC::MI: // N set 1627 case AArch64CC::PL: // N clear 1628 UsedFlags.N = true; 1629 break; 1630 1631 case AArch64CC::VS: // V set 1632 case AArch64CC::VC: // V clear 1633 UsedFlags.V = true; 1634 break; 1635 1636 case AArch64CC::GT: // Z clear, N and V the same 1637 case AArch64CC::LE: // Z set, N and V differ 1638 UsedFlags.Z = true; 1639 LLVM_FALLTHROUGH; 1640 case AArch64CC::GE: // N and V the same 1641 case AArch64CC::LT: // N and V differ 1642 UsedFlags.N = true; 1643 UsedFlags.V = true; 1644 break; 1645 } 1646 return UsedFlags; 1647 } 1648 1649 /// \returns Conditions flags used after \p CmpInstr in its MachineBB if they 1650 /// are not containing C or V flags and NZCV flags are not alive in successors 1651 /// of the same \p CmpInstr and \p MI parent. \returns None otherwise. 1652 /// 1653 /// Collect instructions using that flags in \p CCUseInstrs if provided. 
1654 static Optional<UsedNZCV> 1655 examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1656 const TargetRegisterInfo &TRI, 1657 SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) { 1658 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1659 if (MI.getParent() != CmpParent) 1660 return None; 1661 1662 if (areCFlagsAliveInSuccessors(CmpParent)) 1663 return None; 1664 1665 UsedNZCV NZCVUsedAfterCmp; 1666 for (MachineInstr &Instr : instructionsWithoutDebug( 1667 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1668 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1669 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1670 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1671 return None; 1672 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1673 if (CCUseInstrs) 1674 CCUseInstrs->push_back(&Instr); 1675 } 1676 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1677 break; 1678 } 1679 if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V) 1680 return None; 1681 return NZCVUsedAfterCmp; 1682 } 1683 1684 static bool isADDSRegImm(unsigned Opcode) { 1685 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1686 } 1687 1688 static bool isSUBSRegImm(unsigned Opcode) { 1689 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1690 } 1691 1692 /// Check if CmpInstr can be substituted by MI. 1693 /// 1694 /// CmpInstr can be substituted: 1695 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1696 /// - and, MI and CmpInstr are from the same MachineBB 1697 /// - and, condition flags are not alive in successors of the CmpInstr parent 1698 /// - and, if MI opcode is the S form there must be no defs of flags between 1699 /// MI and CmpInstr 1700 /// or if MI opcode is not the S form there must be neither defs of flags 1701 /// nor uses of flags between MI and CmpInstr. 
1702 /// - and C/V flags are not used after CmpInstr 1703 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1704 const TargetRegisterInfo &TRI) { 1705 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1706 1707 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1708 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1709 return false; 1710 1711 if (!examineCFlagsUse(MI, CmpInstr, TRI)) 1712 return false; 1713 1714 AccessKind AccessToCheck = AK_Write; 1715 if (sForm(MI) != MI.getOpcode()) 1716 AccessToCheck = AK_All; 1717 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1718 } 1719 1720 /// Substitute an instruction comparing to zero with another instruction 1721 /// which produces needed condition flags. 1722 /// 1723 /// Return true on success. 1724 bool AArch64InstrInfo::substituteCmpToZero( 1725 MachineInstr &CmpInstr, unsigned SrcReg, 1726 const MachineRegisterInfo &MRI) const { 1727 // Get the unique definition of SrcReg. 1728 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1729 if (!MI) 1730 return false; 1731 1732 const TargetRegisterInfo &TRI = getRegisterInfo(); 1733 1734 unsigned NewOpc = sForm(*MI); 1735 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1736 return false; 1737 1738 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1739 return false; 1740 1741 // Update the instruction to set NZCV. 1742 MI->setDesc(get(NewOpc)); 1743 CmpInstr.eraseFromParent(); 1744 bool succeeded = UpdateOperandRegClass(*MI); 1745 (void)succeeded; 1746 assert(succeeded && "Some operands reg class are incompatible!"); 1747 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1748 return true; 1749 } 1750 1751 /// \returns True if \p CmpInstr can be removed. 1752 /// 1753 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1754 /// codes used in \p CCUseInstrs must be inverted. 
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
                                 int CmpValue, const TargetRegisterInfo &TRI,
                                 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
                                 bool &IsInvertCC) {
  assert((CmpValue == 0 || CmpValue == 1) &&
         "Only comparisons to 0 or 1 considered for removal!");

  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
  unsigned MIOpc = MI.getOpcode();
  if (MIOpc == AArch64::CSINCWr) {
    if (MI.getOperand(1).getReg() != AArch64::WZR ||
        MI.getOperand(2).getReg() != AArch64::WZR)
      return false;
  } else if (MIOpc == AArch64::CSINCXr) {
    if (MI.getOperand(1).getReg() != AArch64::XZR ||
        MI.getOperand(2).getReg() != AArch64::XZR)
      return false;
  } else {
    return false;
  }
  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
  if (MICC == AArch64CC::Invalid)
    return false;

  // NZCV needs to be defined
  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
    return false;

  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
  const unsigned CmpOpcode = CmpInstr.getOpcode();
  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
  if (CmpValue && !IsSubsRegImm)
    return false;
  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
    return false;

  // MI conditions allowed: eq, ne, mi, pl
  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
  if (MIUsedNZCV.C || MIUsedNZCV.V)
    return false;

  // Note: this call also appends the flag users found after CmpInstr to
  // CCUseInstrs, even if a later check in this function returns false.
  Optional<UsedNZCV> NZCVUsedAfterCmp =
      examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
  // Condition flags are not used in CmpInstr basic block successors and only
  // Z or N flags allowed to be used after CmpInstr within its basic block
  if (!NZCVUsedAfterCmp)
    return false;
  // Z or N flag used after CmpInstr must correspond to the flag used in MI
  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
      (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
    return false;
  // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
  if (MIUsedNZCV.N && !CmpValue)
    return false;

  // There must be no defs of flags between MI and CmpInstr
  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
    return false;

  // Condition code is inverted in the following cases:
  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
               (!CmpValue && MICC == AArch64CC::NE);
  return true;
}

/// Remove comparison in csinc-cmp sequence
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, ne
///   cmp   w9, #0
///   b.eq
///    \endcode
/// to
///    \code
///   csinc w9, wzr, wzr, ne
///   b.ne
///    \endcode
///
/// 2. \code
///   csinc x2, xzr, xzr, mi
///   cmp   x2, #1
///   b.pl
///    \endcode
/// to
///    \code
///   csinc x2, xzr, xzr, mi
///   b.pl
///    \endcode
///
/// \param  CmpInstr comparison instruction
/// \param  SrcReg   register compared by \p CmpInstr; its unique definition
///                  is the candidate csinc
/// \param  CmpValue immediate compared against (0 or 1)
/// \return True when comparison removed
bool AArch64InstrInfo::removeCmpToZeroOrOne(
    MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
    const MachineRegisterInfo &MRI) const {
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;
  const TargetRegisterInfo &TRI = getRegisterInfo();
  SmallVector<MachineInstr *, 4> CCUseInstrs;
  bool IsInvertCC = false;
  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
                            IsInvertCC))
    return false;
  // Make transformation
  CmpInstr.eraseFromParent();
  if (IsInvertCC) {
    // Invert condition codes in CmpInstr CC users
    for (MachineInstr *CCUseInstr : CCUseInstrs) {
      int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
      assert(Idx >= 0 && "Unexpected instruction using CC.");
      MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
      AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
          static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
      CCOperand.setImm(CCUse);
    }
  }
  return true;
}

/// Expand the LOAD_STACK_GUARD and CATCHRET pseudo instructions.
///
/// \returns false (without touching \p MI) for any other opcode, true once
/// the expansion has been emitted. LOAD_STACK_GUARD is erased after
/// expansion; on the CATCHRET path new instructions are inserted but \p MI
/// itself is not erased here.
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
      MI.getOpcode() != AArch64::CATCHRET)
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
  auto TRI = Subtarget.getRegisterInfo();
  DebugLoc DL = MI.getDebugLoc();

  if (MI.getOpcode() == AArch64::CATCHRET) {
    // Skip to the first instruction before the epilog.
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    // Walk backwards over the run of FrameDestroy instructions that precedes
    // MI, then step forward again so the insertion point is the first of
    // them (unless the walk reached the start of the block).
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    // Materialize the address of the target block in X0 (ADRP + ADD).
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  // From here on MI is LOAD_STACK_GUARD; operand 0 is its destination.
  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    // Pick the load form by offset: non-negative multiples of 8 up to 32760
    // use LDRXui (immediate is Offset / 8), offsets in [-256, 255] use
    // LDURXi, and remaining offsets within +/- 4095 adjust the base with
    // ADD/SUB first.
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert a AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
      report_fatal_error("Unable to encode Stack Protector Guard Offset");
    }
    MBB.erase(MI);
    return true;
  }

  // Otherwise the guard is a global value taken from MI's first memory
  // operand; how its address is formed depends on the reference
  // classification and the code model.
  const GlobalValue *GV =
      cast<GlobalValue>((*MI.memoperands_begin())->getValue());
  const TargetMachine &TM = MBB.getParent()->getTarget();
  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
  const unsigned char MO_NC = AArch64II::MO_NC;

  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    // GOT reference: load the address with LOADgot, then load the guard.
    BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
    if (Subtarget.isTargetILP32()) {
      // ILP32: 32-bit load into the sub_32 sub-register of Reg.
      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
          .addDef(Reg32, RegState::Dead)
          .addUse(Reg, RegState::Kill)
          .addImm(0)
          .addMemOperand(*MI.memoperands_begin())
          .addDef(Reg, RegState::Implicit);
    } else {
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
          .addReg(Reg, RegState::Kill)
          .addImm(0)
          .addMemOperand(*MI.memoperands_begin());
    }
  } else if (TM.getCodeModel() == CodeModel::Large) {
    // Large code model: build the 64-bit address 16 bits at a time
    // (MOVZ + 3x MOVK), then load through it.
    assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
    BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
        .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
        .addImm(0);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
        .addImm(16);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
        .addImm(32);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G3)
        .addImm(48);
    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
        .addReg(Reg, RegState::Kill)
        .addImm(0)
        .addMemOperand(*MI.memoperands_begin());
  } else if (TM.getCodeModel() == CodeModel::Tiny) {
    // NOTE(review): only an ADR is emitted on this path; no load follows it
    // here — confirm this is intended.
    BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
  } else {
    // Default: ADRP for the page, then a load using the page offset.
    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
    if (Subtarget.isTargetILP32()) {
      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
          .addDef(Reg32, RegState::Dead)
          .addUse(Reg, RegState::Kill)
          .addGlobalAddress(GV, 0, LoFlags)
          .addMemOperand(*MI.memoperands_begin())
          .addDef(Reg, RegState::Implicit);
    } else {
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
          .addReg(Reg, RegState::Kill)
          .addGlobalAddress(GV, 0, LoFlags)
          .addMemOperand(*MI.memoperands_begin());
    }
  }

  MBB.erase(MI);

  return true;
}

// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
2047 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) { 2048 switch (MI.getOpcode()) { 2049 default: 2050 break; 2051 case AArch64::MOVZWi: 2052 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) 2053 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { 2054 assert(MI.getDesc().getNumOperands() == 3 && 2055 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); 2056 return true; 2057 } 2058 break; 2059 case AArch64::ANDWri: // and Rd, Rzr, #imm 2060 return MI.getOperand(1).getReg() == AArch64::WZR; 2061 case AArch64::ANDXri: 2062 return MI.getOperand(1).getReg() == AArch64::XZR; 2063 case TargetOpcode::COPY: 2064 return MI.getOperand(1).getReg() == AArch64::WZR; 2065 } 2066 return false; 2067 } 2068 2069 // Return true if this instruction simply renames a general register without 2070 // modifying bits. 2071 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { 2072 switch (MI.getOpcode()) { 2073 default: 2074 break; 2075 case TargetOpcode::COPY: { 2076 // GPR32 copies will by lowered to ORRXrs 2077 Register DstReg = MI.getOperand(0).getReg(); 2078 return (AArch64::GPR32RegClass.contains(DstReg) || 2079 AArch64::GPR64RegClass.contains(DstReg)); 2080 } 2081 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) 2082 if (MI.getOperand(1).getReg() == AArch64::XZR) { 2083 assert(MI.getDesc().getNumOperands() == 4 && 2084 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); 2085 return true; 2086 } 2087 break; 2088 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) 2089 if (MI.getOperand(2).getImm() == 0) { 2090 assert(MI.getDesc().getNumOperands() == 4 && 2091 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); 2092 return true; 2093 } 2094 break; 2095 } 2096 return false; 2097 } 2098 2099 // Return true if this instruction simply renames a general register without 2100 // modifying bits. 
2101 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2102 switch (MI.getOpcode()) { 2103 default: 2104 break; 2105 case TargetOpcode::COPY: { 2106 Register DstReg = MI.getOperand(0).getReg(); 2107 return AArch64::FPR128RegClass.contains(DstReg); 2108 } 2109 case AArch64::ORRv16i8: 2110 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2111 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2112 "invalid ORRv16i8 operands"); 2113 return true; 2114 } 2115 break; 2116 } 2117 return false; 2118 } 2119 2120 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2121 int &FrameIndex) const { 2122 switch (MI.getOpcode()) { 2123 default: 2124 break; 2125 case AArch64::LDRWui: 2126 case AArch64::LDRXui: 2127 case AArch64::LDRBui: 2128 case AArch64::LDRHui: 2129 case AArch64::LDRSui: 2130 case AArch64::LDRDui: 2131 case AArch64::LDRQui: 2132 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2133 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2134 FrameIndex = MI.getOperand(1).getIndex(); 2135 return MI.getOperand(0).getReg(); 2136 } 2137 break; 2138 } 2139 2140 return 0; 2141 } 2142 2143 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2144 int &FrameIndex) const { 2145 switch (MI.getOpcode()) { 2146 default: 2147 break; 2148 case AArch64::STRWui: 2149 case AArch64::STRXui: 2150 case AArch64::STRBui: 2151 case AArch64::STRHui: 2152 case AArch64::STRSui: 2153 case AArch64::STRDui: 2154 case AArch64::STRQui: 2155 case AArch64::LDR_PXI: 2156 case AArch64::STR_PXI: 2157 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2158 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2159 FrameIndex = MI.getOperand(1).getIndex(); 2160 return MI.getOperand(0).getReg(); 2161 } 2162 break; 2163 } 2164 return 0; 2165 } 2166 2167 /// Check all MachineMemOperands for a hint to suppress pairing. 
2168 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2169 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2170 return MMO->getFlags() & MOSuppressPair; 2171 }); 2172 } 2173 2174 /// Set a flag on the first MachineMemOperand to suppress pairing. 2175 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2176 if (MI.memoperands_empty()) 2177 return; 2178 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2179 } 2180 2181 /// Check all MachineMemOperands for a hint that the load/store is strided. 2182 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2183 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2184 return MMO->getFlags() & MOStridedAccess; 2185 }); 2186 } 2187 2188 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2189 switch (Opc) { 2190 default: 2191 return false; 2192 case AArch64::STURSi: 2193 case AArch64::STRSpre: 2194 case AArch64::STURDi: 2195 case AArch64::STRDpre: 2196 case AArch64::STURQi: 2197 case AArch64::STRQpre: 2198 case AArch64::STURBBi: 2199 case AArch64::STURHHi: 2200 case AArch64::STURWi: 2201 case AArch64::STRWpre: 2202 case AArch64::STURXi: 2203 case AArch64::STRXpre: 2204 case AArch64::LDURSi: 2205 case AArch64::LDRSpre: 2206 case AArch64::LDURDi: 2207 case AArch64::LDRDpre: 2208 case AArch64::LDURQi: 2209 case AArch64::LDRQpre: 2210 case AArch64::LDURWi: 2211 case AArch64::LDRWpre: 2212 case AArch64::LDURXi: 2213 case AArch64::LDRXpre: 2214 case AArch64::LDURSWi: 2215 case AArch64::LDURHHi: 2216 case AArch64::LDURBBi: 2217 case AArch64::LDURSBWi: 2218 case AArch64::LDURSHWi: 2219 return true; 2220 } 2221 } 2222 2223 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2224 switch (Opc) { 2225 default: return {}; 2226 case AArch64::PRFMui: return AArch64::PRFUMi; 2227 case AArch64::LDRXui: return AArch64::LDURXi; 2228 case AArch64::LDRWui: return AArch64::LDURWi; 2229 case AArch64::LDRBui: return AArch64::LDURBi; 2230 case 
AArch64::LDRHui: return AArch64::LDURHi; 2231 case AArch64::LDRSui: return AArch64::LDURSi; 2232 case AArch64::LDRDui: return AArch64::LDURDi; 2233 case AArch64::LDRQui: return AArch64::LDURQi; 2234 case AArch64::LDRBBui: return AArch64::LDURBBi; 2235 case AArch64::LDRHHui: return AArch64::LDURHHi; 2236 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2237 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2238 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2239 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2240 case AArch64::LDRSWui: return AArch64::LDURSWi; 2241 case AArch64::STRXui: return AArch64::STURXi; 2242 case AArch64::STRWui: return AArch64::STURWi; 2243 case AArch64::STRBui: return AArch64::STURBi; 2244 case AArch64::STRHui: return AArch64::STURHi; 2245 case AArch64::STRSui: return AArch64::STURSi; 2246 case AArch64::STRDui: return AArch64::STURDi; 2247 case AArch64::STRQui: return AArch64::STURQi; 2248 case AArch64::STRBBui: return AArch64::STURBBi; 2249 case AArch64::STRHHui: return AArch64::STURHHi; 2250 } 2251 } 2252 2253 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2254 switch (Opc) { 2255 default: 2256 return 2; 2257 case AArch64::LDPXi: 2258 case AArch64::LDPDi: 2259 case AArch64::STPXi: 2260 case AArch64::STPDi: 2261 case AArch64::LDNPXi: 2262 case AArch64::LDNPDi: 2263 case AArch64::STNPXi: 2264 case AArch64::STNPDi: 2265 case AArch64::LDPQi: 2266 case AArch64::STPQi: 2267 case AArch64::LDNPQi: 2268 case AArch64::STNPQi: 2269 case AArch64::LDPWi: 2270 case AArch64::LDPSi: 2271 case AArch64::STPWi: 2272 case AArch64::STPSi: 2273 case AArch64::LDNPWi: 2274 case AArch64::LDNPSi: 2275 case AArch64::STNPWi: 2276 case AArch64::STNPSi: 2277 case AArch64::LDG: 2278 case AArch64::STGPi: 2279 case AArch64::LD1B_IMM: 2280 case AArch64::LD1H_IMM: 2281 case AArch64::LD1W_IMM: 2282 case AArch64::LD1D_IMM: 2283 case AArch64::ST1B_IMM: 2284 case AArch64::ST1H_IMM: 2285 case AArch64::ST1W_IMM: 2286 case AArch64::ST1D_IMM: 2287 case 
AArch64::LD1B_H_IMM: 2288 case AArch64::LD1SB_H_IMM: 2289 case AArch64::LD1H_S_IMM: 2290 case AArch64::LD1SH_S_IMM: 2291 case AArch64::LD1W_D_IMM: 2292 case AArch64::LD1SW_D_IMM: 2293 case AArch64::ST1B_H_IMM: 2294 case AArch64::ST1H_S_IMM: 2295 case AArch64::ST1W_D_IMM: 2296 case AArch64::LD1B_S_IMM: 2297 case AArch64::LD1SB_S_IMM: 2298 case AArch64::LD1H_D_IMM: 2299 case AArch64::LD1SH_D_IMM: 2300 case AArch64::ST1B_S_IMM: 2301 case AArch64::ST1H_D_IMM: 2302 case AArch64::LD1B_D_IMM: 2303 case AArch64::LD1SB_D_IMM: 2304 case AArch64::ST1B_D_IMM: 2305 case AArch64::LD1RB_IMM: 2306 case AArch64::LD1RB_H_IMM: 2307 case AArch64::LD1RB_S_IMM: 2308 case AArch64::LD1RB_D_IMM: 2309 case AArch64::LD1RSB_H_IMM: 2310 case AArch64::LD1RSB_S_IMM: 2311 case AArch64::LD1RSB_D_IMM: 2312 case AArch64::LD1RH_IMM: 2313 case AArch64::LD1RH_S_IMM: 2314 case AArch64::LD1RH_D_IMM: 2315 case AArch64::LD1RSH_S_IMM: 2316 case AArch64::LD1RSH_D_IMM: 2317 case AArch64::LD1RW_IMM: 2318 case AArch64::LD1RW_D_IMM: 2319 case AArch64::LD1RSW_IMM: 2320 case AArch64::LD1RD_IMM: 2321 return 3; 2322 case AArch64::ADDG: 2323 case AArch64::STGOffset: 2324 case AArch64::LDR_PXI: 2325 case AArch64::STR_PXI: 2326 return 2; 2327 } 2328 } 2329 2330 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2331 switch (MI.getOpcode()) { 2332 default: 2333 return false; 2334 // Scaled instructions. 2335 case AArch64::STRSui: 2336 case AArch64::STRDui: 2337 case AArch64::STRQui: 2338 case AArch64::STRXui: 2339 case AArch64::STRWui: 2340 case AArch64::LDRSui: 2341 case AArch64::LDRDui: 2342 case AArch64::LDRQui: 2343 case AArch64::LDRXui: 2344 case AArch64::LDRWui: 2345 case AArch64::LDRSWui: 2346 // Unscaled instructions. 
2347 case AArch64::STURSi: 2348 case AArch64::STRSpre: 2349 case AArch64::STURDi: 2350 case AArch64::STRDpre: 2351 case AArch64::STURQi: 2352 case AArch64::STRQpre: 2353 case AArch64::STURWi: 2354 case AArch64::STRWpre: 2355 case AArch64::STURXi: 2356 case AArch64::STRXpre: 2357 case AArch64::LDURSi: 2358 case AArch64::LDRSpre: 2359 case AArch64::LDURDi: 2360 case AArch64::LDRDpre: 2361 case AArch64::LDURQi: 2362 case AArch64::LDRQpre: 2363 case AArch64::LDURWi: 2364 case AArch64::LDRWpre: 2365 case AArch64::LDURXi: 2366 case AArch64::LDRXpre: 2367 case AArch64::LDURSWi: 2368 return true; 2369 } 2370 } 2371 2372 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 2373 bool &Is64Bit) { 2374 switch (Opc) { 2375 default: 2376 llvm_unreachable("Opcode has no flag setting equivalent!"); 2377 // 32-bit cases: 2378 case AArch64::ADDWri: 2379 Is64Bit = false; 2380 return AArch64::ADDSWri; 2381 case AArch64::ADDWrr: 2382 Is64Bit = false; 2383 return AArch64::ADDSWrr; 2384 case AArch64::ADDWrs: 2385 Is64Bit = false; 2386 return AArch64::ADDSWrs; 2387 case AArch64::ADDWrx: 2388 Is64Bit = false; 2389 return AArch64::ADDSWrx; 2390 case AArch64::ANDWri: 2391 Is64Bit = false; 2392 return AArch64::ANDSWri; 2393 case AArch64::ANDWrr: 2394 Is64Bit = false; 2395 return AArch64::ANDSWrr; 2396 case AArch64::ANDWrs: 2397 Is64Bit = false; 2398 return AArch64::ANDSWrs; 2399 case AArch64::BICWrr: 2400 Is64Bit = false; 2401 return AArch64::BICSWrr; 2402 case AArch64::BICWrs: 2403 Is64Bit = false; 2404 return AArch64::BICSWrs; 2405 case AArch64::SUBWri: 2406 Is64Bit = false; 2407 return AArch64::SUBSWri; 2408 case AArch64::SUBWrr: 2409 Is64Bit = false; 2410 return AArch64::SUBSWrr; 2411 case AArch64::SUBWrs: 2412 Is64Bit = false; 2413 return AArch64::SUBSWrs; 2414 case AArch64::SUBWrx: 2415 Is64Bit = false; 2416 return AArch64::SUBSWrx; 2417 // 64-bit cases: 2418 case AArch64::ADDXri: 2419 Is64Bit = true; 2420 return AArch64::ADDSXri; 2421 case AArch64::ADDXrr: 2422 Is64Bit = 
true; 2423 return AArch64::ADDSXrr; 2424 case AArch64::ADDXrs: 2425 Is64Bit = true; 2426 return AArch64::ADDSXrs; 2427 case AArch64::ADDXrx: 2428 Is64Bit = true; 2429 return AArch64::ADDSXrx; 2430 case AArch64::ANDXri: 2431 Is64Bit = true; 2432 return AArch64::ANDSXri; 2433 case AArch64::ANDXrr: 2434 Is64Bit = true; 2435 return AArch64::ANDSXrr; 2436 case AArch64::ANDXrs: 2437 Is64Bit = true; 2438 return AArch64::ANDSXrs; 2439 case AArch64::BICXrr: 2440 Is64Bit = true; 2441 return AArch64::BICSXrr; 2442 case AArch64::BICXrs: 2443 Is64Bit = true; 2444 return AArch64::BICSXrs; 2445 case AArch64::SUBXri: 2446 Is64Bit = true; 2447 return AArch64::SUBSXri; 2448 case AArch64::SUBXrr: 2449 Is64Bit = true; 2450 return AArch64::SUBSXrr; 2451 case AArch64::SUBXrs: 2452 Is64Bit = true; 2453 return AArch64::SUBSXrs; 2454 case AArch64::SUBXrx: 2455 Is64Bit = true; 2456 return AArch64::SUBSXrx; 2457 } 2458 } 2459 2460 // Is this a candidate for ld/st merging or pairing? For example, we don't 2461 // touch volatiles or load/stores that have a hint to avoid pair formation. 2462 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2463 2464 bool IsPreLdSt = isPreLdSt(MI); 2465 2466 // If this is a volatile load/store, don't mess with it. 2467 if (MI.hasOrderedMemoryRef()) 2468 return false; 2469 2470 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2471 // For Pre-inc LD/ST, the operand is shifted by one. 2472 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2473 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2474 "Expected a reg or frame index operand."); 2475 2476 // For Pre-indexed addressing quadword instructions, the third operand is the 2477 // immediate value. 2478 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2479 2480 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2481 return false; 2482 2483 // Can't merge/pair if the instruction modifies the base register. 
2484 // e.g., ldr x0, [x0] 2485 // This case will never occur with an FI base. 2486 // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged. 2487 // For example: 2488 // ldr q0, [x11, #32]! 2489 // ldr q1, [x11, #16] 2490 // to 2491 // ldp q0, q1, [x11, #32]! 2492 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2493 Register BaseReg = MI.getOperand(1).getReg(); 2494 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2495 if (MI.modifiesRegister(BaseReg, TRI)) 2496 return false; 2497 } 2498 2499 // Check if this load/store has a hint to avoid pair formation. 2500 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2501 if (isLdStPairSuppressed(MI)) 2502 return false; 2503 2504 // Do not pair any callee-save store/reload instructions in the 2505 // prologue/epilogue if the CFI information encoded the operations as separate 2506 // instructions, as that will cause the size of the actual prologue to mismatch 2507 // with the prologue size recorded in the Windows CFI. 2508 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2509 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2510 MI.getMF()->getFunction().needsUnwindTableEntry(); 2511 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2512 MI.getFlag(MachineInstr::FrameDestroy))) 2513 return false; 2514 2515 // On some CPUs quad load/store pairs are slower than two single load/stores. 
2516 if (Subtarget.isPaired128Slow()) { 2517 switch (MI.getOpcode()) { 2518 default: 2519 break; 2520 case AArch64::LDURQi: 2521 case AArch64::STURQi: 2522 case AArch64::LDRQui: 2523 case AArch64::STRQui: 2524 return false; 2525 } 2526 } 2527 2528 return true; 2529 } 2530 2531 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2532 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2533 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2534 const TargetRegisterInfo *TRI) const { 2535 if (!LdSt.mayLoadOrStore()) 2536 return false; 2537 2538 const MachineOperand *BaseOp; 2539 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2540 Width, TRI)) 2541 return false; 2542 BaseOps.push_back(BaseOp); 2543 return true; 2544 } 2545 2546 Optional<ExtAddrMode> 2547 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2548 const TargetRegisterInfo *TRI) const { 2549 const MachineOperand *Base; // Filled with the base operand of MI. 2550 int64_t Offset; // Filled with the offset of MI. 2551 bool OffsetIsScalable; 2552 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2553 return None; 2554 2555 if (!Base->isReg()) 2556 return None; 2557 ExtAddrMode AM; 2558 AM.BaseReg = Base->getReg(); 2559 AM.Displacement = Offset; 2560 AM.ScaledReg = 0; 2561 return AM; 2562 } 2563 2564 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2565 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2566 bool &OffsetIsScalable, unsigned &Width, 2567 const TargetRegisterInfo *TRI) const { 2568 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2569 // Handle only loads/stores with base register followed by immediate offset. 2570 if (LdSt.getNumExplicitOperands() == 3) { 2571 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 
2572 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2573 !LdSt.getOperand(2).isImm()) 2574 return false; 2575 } else if (LdSt.getNumExplicitOperands() == 4) { 2576 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2577 if (!LdSt.getOperand(1).isReg() || 2578 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2579 !LdSt.getOperand(3).isImm()) 2580 return false; 2581 } else 2582 return false; 2583 2584 // Get the scaling factor for the instruction and set the width for the 2585 // instruction. 2586 TypeSize Scale(0U, false); 2587 int64_t Dummy1, Dummy2; 2588 2589 // If this returns false, then it's an instruction we don't want to handle. 2590 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2591 return false; 2592 2593 // Compute the offset. Offset is calculated as the immediate operand 2594 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2595 // set to 1. 2596 if (LdSt.getNumExplicitOperands() == 3) { 2597 BaseOp = &LdSt.getOperand(1); 2598 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); 2599 } else { 2600 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2601 BaseOp = &LdSt.getOperand(2); 2602 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); 2603 } 2604 OffsetIsScalable = Scale.isScalable(); 2605 2606 if (!BaseOp->isReg() && !BaseOp->isFI()) 2607 return false; 2608 2609 return true; 2610 } 2611 2612 MachineOperand & 2613 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2614 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2615 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2616 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2617 return OfsOp; 2618 } 2619 2620 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2621 unsigned &Width, int64_t &MinOffset, 2622 int64_t &MaxOffset) { 2623 const unsigned SVEMaxBytesPerVector = 
AArch64::SVEMaxBitsPerVector / 8; 2624 switch (Opcode) { 2625 // Not a memory operation or something we want to handle. 2626 default: 2627 Scale = TypeSize::Fixed(0); 2628 Width = 0; 2629 MinOffset = MaxOffset = 0; 2630 return false; 2631 case AArch64::STRWpost: 2632 case AArch64::LDRWpost: 2633 Width = 32; 2634 Scale = TypeSize::Fixed(4); 2635 MinOffset = -256; 2636 MaxOffset = 255; 2637 break; 2638 case AArch64::LDURQi: 2639 case AArch64::STURQi: 2640 Width = 16; 2641 Scale = TypeSize::Fixed(1); 2642 MinOffset = -256; 2643 MaxOffset = 255; 2644 break; 2645 case AArch64::PRFUMi: 2646 case AArch64::LDURXi: 2647 case AArch64::LDURDi: 2648 case AArch64::STURXi: 2649 case AArch64::STURDi: 2650 Width = 8; 2651 Scale = TypeSize::Fixed(1); 2652 MinOffset = -256; 2653 MaxOffset = 255; 2654 break; 2655 case AArch64::LDURWi: 2656 case AArch64::LDURSi: 2657 case AArch64::LDURSWi: 2658 case AArch64::STURWi: 2659 case AArch64::STURSi: 2660 Width = 4; 2661 Scale = TypeSize::Fixed(1); 2662 MinOffset = -256; 2663 MaxOffset = 255; 2664 break; 2665 case AArch64::LDURHi: 2666 case AArch64::LDURHHi: 2667 case AArch64::LDURSHXi: 2668 case AArch64::LDURSHWi: 2669 case AArch64::STURHi: 2670 case AArch64::STURHHi: 2671 Width = 2; 2672 Scale = TypeSize::Fixed(1); 2673 MinOffset = -256; 2674 MaxOffset = 255; 2675 break; 2676 case AArch64::LDURBi: 2677 case AArch64::LDURBBi: 2678 case AArch64::LDURSBXi: 2679 case AArch64::LDURSBWi: 2680 case AArch64::STURBi: 2681 case AArch64::STURBBi: 2682 Width = 1; 2683 Scale = TypeSize::Fixed(1); 2684 MinOffset = -256; 2685 MaxOffset = 255; 2686 break; 2687 case AArch64::LDPQi: 2688 case AArch64::LDNPQi: 2689 case AArch64::STPQi: 2690 case AArch64::STNPQi: 2691 Scale = TypeSize::Fixed(16); 2692 Width = 32; 2693 MinOffset = -64; 2694 MaxOffset = 63; 2695 break; 2696 case AArch64::LDRQui: 2697 case AArch64::STRQui: 2698 Scale = TypeSize::Fixed(16); 2699 Width = 16; 2700 MinOffset = 0; 2701 MaxOffset = 4095; 2702 break; 2703 case AArch64::LDPXi: 2704 case 
AArch64::LDPDi: 2705 case AArch64::LDNPXi: 2706 case AArch64::LDNPDi: 2707 case AArch64::STPXi: 2708 case AArch64::STPDi: 2709 case AArch64::STNPXi: 2710 case AArch64::STNPDi: 2711 Scale = TypeSize::Fixed(8); 2712 Width = 16; 2713 MinOffset = -64; 2714 MaxOffset = 63; 2715 break; 2716 case AArch64::PRFMui: 2717 case AArch64::LDRXui: 2718 case AArch64::LDRDui: 2719 case AArch64::STRXui: 2720 case AArch64::STRDui: 2721 Scale = TypeSize::Fixed(8); 2722 Width = 8; 2723 MinOffset = 0; 2724 MaxOffset = 4095; 2725 break; 2726 case AArch64::StoreSwiftAsyncContext: 2727 // Store is an STRXui, but there might be an ADDXri in the expansion too. 2728 Scale = TypeSize::Fixed(1); 2729 Width = 8; 2730 MinOffset = 0; 2731 MaxOffset = 4095; 2732 break; 2733 case AArch64::LDPWi: 2734 case AArch64::LDPSi: 2735 case AArch64::LDNPWi: 2736 case AArch64::LDNPSi: 2737 case AArch64::STPWi: 2738 case AArch64::STPSi: 2739 case AArch64::STNPWi: 2740 case AArch64::STNPSi: 2741 Scale = TypeSize::Fixed(4); 2742 Width = 8; 2743 MinOffset = -64; 2744 MaxOffset = 63; 2745 break; 2746 case AArch64::LDRWui: 2747 case AArch64::LDRSui: 2748 case AArch64::LDRSWui: 2749 case AArch64::STRWui: 2750 case AArch64::STRSui: 2751 Scale = TypeSize::Fixed(4); 2752 Width = 4; 2753 MinOffset = 0; 2754 MaxOffset = 4095; 2755 break; 2756 case AArch64::LDRHui: 2757 case AArch64::LDRHHui: 2758 case AArch64::LDRSHWui: 2759 case AArch64::LDRSHXui: 2760 case AArch64::STRHui: 2761 case AArch64::STRHHui: 2762 Scale = TypeSize::Fixed(2); 2763 Width = 2; 2764 MinOffset = 0; 2765 MaxOffset = 4095; 2766 break; 2767 case AArch64::LDRBui: 2768 case AArch64::LDRBBui: 2769 case AArch64::LDRSBWui: 2770 case AArch64::LDRSBXui: 2771 case AArch64::STRBui: 2772 case AArch64::STRBBui: 2773 Scale = TypeSize::Fixed(1); 2774 Width = 1; 2775 MinOffset = 0; 2776 MaxOffset = 4095; 2777 break; 2778 case AArch64::STPXpre: 2779 case AArch64::LDPXpost: 2780 case AArch64::STPDpre: 2781 case AArch64::LDPDpost: 2782 Scale = TypeSize::Fixed(8); 2783 
Width = 8; 2784 MinOffset = -512; 2785 MaxOffset = 504; 2786 break; 2787 case AArch64::STPQpre: 2788 case AArch64::LDPQpost: 2789 Scale = TypeSize::Fixed(16); 2790 Width = 16; 2791 MinOffset = -1024; 2792 MaxOffset = 1008; 2793 break; 2794 case AArch64::STRXpre: 2795 case AArch64::STRDpre: 2796 case AArch64::LDRXpost: 2797 case AArch64::LDRDpost: 2798 Scale = TypeSize::Fixed(1); 2799 Width = 8; 2800 MinOffset = -256; 2801 MaxOffset = 255; 2802 break; 2803 case AArch64::STRQpre: 2804 case AArch64::LDRQpost: 2805 Scale = TypeSize::Fixed(1); 2806 Width = 16; 2807 MinOffset = -256; 2808 MaxOffset = 255; 2809 break; 2810 case AArch64::ADDG: 2811 Scale = TypeSize::Fixed(16); 2812 Width = 0; 2813 MinOffset = 0; 2814 MaxOffset = 63; 2815 break; 2816 case AArch64::TAGPstack: 2817 Scale = TypeSize::Fixed(16); 2818 Width = 0; 2819 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2820 // of 63 (not 64!). 2821 MinOffset = -63; 2822 MaxOffset = 63; 2823 break; 2824 case AArch64::LDG: 2825 case AArch64::STGOffset: 2826 case AArch64::STZGOffset: 2827 Scale = TypeSize::Fixed(16); 2828 Width = 16; 2829 MinOffset = -256; 2830 MaxOffset = 255; 2831 break; 2832 case AArch64::STR_ZZZZXI: 2833 case AArch64::LDR_ZZZZXI: 2834 Scale = TypeSize::Scalable(16); 2835 Width = SVEMaxBytesPerVector * 4; 2836 MinOffset = -256; 2837 MaxOffset = 252; 2838 break; 2839 case AArch64::STR_ZZZXI: 2840 case AArch64::LDR_ZZZXI: 2841 Scale = TypeSize::Scalable(16); 2842 Width = SVEMaxBytesPerVector * 3; 2843 MinOffset = -256; 2844 MaxOffset = 253; 2845 break; 2846 case AArch64::STR_ZZXI: 2847 case AArch64::LDR_ZZXI: 2848 Scale = TypeSize::Scalable(16); 2849 Width = SVEMaxBytesPerVector * 2; 2850 MinOffset = -256; 2851 MaxOffset = 254; 2852 break; 2853 case AArch64::LDR_PXI: 2854 case AArch64::STR_PXI: 2855 Scale = TypeSize::Scalable(2); 2856 Width = SVEMaxBytesPerVector / 8; 2857 MinOffset = -256; 2858 MaxOffset = 255; 2859 break; 2860 case AArch64::LDR_ZXI: 2861 case 
AArch64::STR_ZXI: 2862 Scale = TypeSize::Scalable(16); 2863 Width = SVEMaxBytesPerVector; 2864 MinOffset = -256; 2865 MaxOffset = 255; 2866 break; 2867 case AArch64::LD1B_IMM: 2868 case AArch64::LD1H_IMM: 2869 case AArch64::LD1W_IMM: 2870 case AArch64::LD1D_IMM: 2871 case AArch64::ST1B_IMM: 2872 case AArch64::ST1H_IMM: 2873 case AArch64::ST1W_IMM: 2874 case AArch64::ST1D_IMM: 2875 // A full vectors worth of data 2876 // Width = mbytes * elements 2877 Scale = TypeSize::Scalable(16); 2878 Width = SVEMaxBytesPerVector; 2879 MinOffset = -8; 2880 MaxOffset = 7; 2881 break; 2882 case AArch64::LD1B_H_IMM: 2883 case AArch64::LD1SB_H_IMM: 2884 case AArch64::LD1H_S_IMM: 2885 case AArch64::LD1SH_S_IMM: 2886 case AArch64::LD1W_D_IMM: 2887 case AArch64::LD1SW_D_IMM: 2888 case AArch64::ST1B_H_IMM: 2889 case AArch64::ST1H_S_IMM: 2890 case AArch64::ST1W_D_IMM: 2891 // A half vector worth of data 2892 // Width = mbytes * elements 2893 Scale = TypeSize::Scalable(8); 2894 Width = SVEMaxBytesPerVector / 2; 2895 MinOffset = -8; 2896 MaxOffset = 7; 2897 break; 2898 case AArch64::LD1B_S_IMM: 2899 case AArch64::LD1SB_S_IMM: 2900 case AArch64::LD1H_D_IMM: 2901 case AArch64::LD1SH_D_IMM: 2902 case AArch64::ST1B_S_IMM: 2903 case AArch64::ST1H_D_IMM: 2904 // A quarter vector worth of data 2905 // Width = mbytes * elements 2906 Scale = TypeSize::Scalable(4); 2907 Width = SVEMaxBytesPerVector / 4; 2908 MinOffset = -8; 2909 MaxOffset = 7; 2910 break; 2911 case AArch64::LD1B_D_IMM: 2912 case AArch64::LD1SB_D_IMM: 2913 case AArch64::ST1B_D_IMM: 2914 // A eighth vector worth of data 2915 // Width = mbytes * elements 2916 Scale = TypeSize::Scalable(2); 2917 Width = SVEMaxBytesPerVector / 8; 2918 MinOffset = -8; 2919 MaxOffset = 7; 2920 break; 2921 case AArch64::ST2GOffset: 2922 case AArch64::STZ2GOffset: 2923 Scale = TypeSize::Fixed(16); 2924 Width = 32; 2925 MinOffset = -256; 2926 MaxOffset = 255; 2927 break; 2928 case AArch64::STGPi: 2929 Scale = TypeSize::Fixed(16); 2930 Width = 16; 2931 
MinOffset = -64; 2932 MaxOffset = 63; 2933 break; 2934 case AArch64::LD1RB_IMM: 2935 case AArch64::LD1RB_H_IMM: 2936 case AArch64::LD1RB_S_IMM: 2937 case AArch64::LD1RB_D_IMM: 2938 case AArch64::LD1RSB_H_IMM: 2939 case AArch64::LD1RSB_S_IMM: 2940 case AArch64::LD1RSB_D_IMM: 2941 Scale = TypeSize::Fixed(1); 2942 Width = 1; 2943 MinOffset = 0; 2944 MaxOffset = 63; 2945 break; 2946 case AArch64::LD1RH_IMM: 2947 case AArch64::LD1RH_S_IMM: 2948 case AArch64::LD1RH_D_IMM: 2949 case AArch64::LD1RSH_S_IMM: 2950 case AArch64::LD1RSH_D_IMM: 2951 Scale = TypeSize::Fixed(2); 2952 Width = 2; 2953 MinOffset = 0; 2954 MaxOffset = 63; 2955 break; 2956 case AArch64::LD1RW_IMM: 2957 case AArch64::LD1RW_D_IMM: 2958 case AArch64::LD1RSW_IMM: 2959 Scale = TypeSize::Fixed(4); 2960 Width = 4; 2961 MinOffset = 0; 2962 MaxOffset = 63; 2963 break; 2964 case AArch64::LD1RD_IMM: 2965 Scale = TypeSize::Fixed(8); 2966 Width = 8; 2967 MinOffset = 0; 2968 MaxOffset = 63; 2969 break; 2970 } 2971 2972 return true; 2973 } 2974 2975 // Scaling factor for unscaled load or store. 
int AArch64InstrInfo::getMemScale(unsigned Opc) {
  // The table covers scaled (..ui), unscaled (LDUR/STUR), pre-indexed and
  // paired forms; any opcode not listed is a programming error.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::LDPQi:
  case AArch64::LDRQpre:
  case AArch64::STPQi:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
  case AArch64::STGPi:
    return 16;
  }
}

/// Return true if \p MI is one of the pre-indexed loads listed below.
bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDRWpre:
  case AArch64::LDRXpre:
  case AArch64::LDRSpre:
  case AArch64::LDRDpre:
  case AArch64::LDRQpre:
    return true;
  }
}

/// Return true if \p MI is one of the pre-indexed stores listed below.
bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STRWpre:
  case AArch64::STRXpre:
  case AArch64::STRSpre:
  case AArch64::STRDpre:
  case AArch64::STRQpre:
    return true;
  }
}

/// Return true if \p MI is a pre-indexed load or a pre-indexed store.
bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
  return isPreLd(MI) || isPreSt(MI);
}

// Scale the unscaled offsets.  Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
  int Scale = AArch64InstrInfo::getMemScale(Opc);

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}

/// Return true if loads/stores with opcodes \p FirstOpc and \p SecondOpc may
/// be paired: either the opcodes are identical, or one is a 32-bit load
/// (LDRWui/LDURWi) and the other its sign-extending form (LDRSWui/LDURSWi).
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;
  // We can also pair sign-ext and zero-ext instructions.
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  }
  // These instructions can't be paired based on their opcodes.
  // (Every path through the switch above returns, so this is not reached.)
  return false;
}

/// Decide whether two frame-index based accesses should be clustered.
///
/// When both frame indices are fixed objects, each object offset is converted
/// to units of its opcode's memory scale, the instruction offset is added, and
/// the accesses cluster only if the second lands exactly one unit after the
/// first. Returns false if either object offset is not a multiple of its
/// scale. In all other cases, the accesses cluster only when they use the same
/// frame index.
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
                            int64_t Offset1, unsigned Opcode1, int FI2,
                            int64_t Offset2, unsigned Opcode2) {
  // Accesses through fixed stack object frame indices may access a different
  // fixed stack slot. Check that the object offsets + offsets match.
  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
    // Convert to scaled object offsets.
    int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
    if (ObjectOffset1 % Scale1 != 0)
      return false;
    ObjectOffset1 /= Scale1;
    int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
    if (ObjectOffset2 % Scale2 != 0)
      return false;
    ObjectOffset2 /= Scale2;
    ObjectOffset1 += Offset1;
    ObjectOffset2 += Offset2;
    return ObjectOffset1 + 1 == ObjectOffset2;
  }

  return FI1 == FI2;
}

/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1,
    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
3157 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 3158 return false; 3159 3160 // Only cluster up to a single pair. 3161 if (NumLoads > 2) 3162 return false; 3163 3164 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 3165 return false; 3166 3167 // Can we pair these instructions based on their opcodes? 3168 unsigned FirstOpc = FirstLdSt.getOpcode(); 3169 unsigned SecondOpc = SecondLdSt.getOpcode(); 3170 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 3171 return false; 3172 3173 // Can't merge volatiles or load/stores that have a hint to avoid pair 3174 // formation, for example. 3175 if (!isCandidateToMergeOrPair(FirstLdSt) || 3176 !isCandidateToMergeOrPair(SecondLdSt)) 3177 return false; 3178 3179 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 3180 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 3181 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 3182 return false; 3183 3184 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 3185 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 3186 return false; 3187 3188 // Pairwise instructions have a 7-bit signed offset field. 3189 if (Offset1 > 63 || Offset1 < -64) 3190 return false; 3191 3192 // The caller should already have ordered First/SecondLdSt by offset. 
3193 // Note: except for non-equal frame index bases 3194 if (BaseOp1.isFI()) { 3195 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 3196 "Caller should have ordered offsets."); 3197 3198 const MachineFrameInfo &MFI = 3199 FirstLdSt.getParent()->getParent()->getFrameInfo(); 3200 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 3201 BaseOp2.getIndex(), Offset2, SecondOpc); 3202 } 3203 3204 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 3205 3206 return Offset1 + 1 == Offset2; 3207 } 3208 3209 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 3210 unsigned Reg, unsigned SubIdx, 3211 unsigned State, 3212 const TargetRegisterInfo *TRI) { 3213 if (!SubIdx) 3214 return MIB.addReg(Reg, State); 3215 3216 if (Register::isPhysicalRegister(Reg)) 3217 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 3218 return MIB.addReg(Reg, State, SubIdx); 3219 } 3220 3221 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 3222 unsigned NumRegs) { 3223 // We really want the positive remainder mod 32 here, that happens to be 3224 // easily obtainable with a mask. 
3225 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 3226 } 3227 3228 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 3229 MachineBasicBlock::iterator I, 3230 const DebugLoc &DL, MCRegister DestReg, 3231 MCRegister SrcReg, bool KillSrc, 3232 unsigned Opcode, 3233 ArrayRef<unsigned> Indices) const { 3234 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 3235 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3236 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3237 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3238 unsigned NumRegs = Indices.size(); 3239 3240 int SubReg = 0, End = NumRegs, Incr = 1; 3241 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 3242 SubReg = NumRegs - 1; 3243 End = -1; 3244 Incr = -1; 3245 } 3246 3247 for (; SubReg != End; SubReg += Incr) { 3248 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3249 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3250 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 3251 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3252 } 3253 } 3254 3255 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 3256 MachineBasicBlock::iterator I, 3257 DebugLoc DL, unsigned DestReg, 3258 unsigned SrcReg, bool KillSrc, 3259 unsigned Opcode, unsigned ZeroReg, 3260 llvm::ArrayRef<unsigned> Indices) const { 3261 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3262 unsigned NumRegs = Indices.size(); 3263 3264 #ifndef NDEBUG 3265 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3266 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3267 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 3268 "GPR reg sequences should not be able to overlap"); 3269 #endif 3270 3271 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 3272 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3273 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 
3274 MIB.addReg(ZeroReg); 3275 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3276 MIB.addImm(0); 3277 } 3278 } 3279 3280 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 3281 MachineBasicBlock::iterator I, 3282 const DebugLoc &DL, MCRegister DestReg, 3283 MCRegister SrcReg, bool KillSrc) const { 3284 if (AArch64::GPR32spRegClass.contains(DestReg) && 3285 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 3286 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3287 3288 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 3289 // If either operand is WSP, expand to ADD #0. 3290 if (Subtarget.hasZeroCycleRegMove()) { 3291 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 3292 MCRegister DestRegX = TRI->getMatchingSuperReg( 3293 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3294 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3295 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3296 // This instruction is reading and writing X registers. This may upset 3297 // the register scavenger and machine verifier, so we need to indicate 3298 // that we are reading an undefined value from SrcRegX, but a proper 3299 // value from SrcReg. 
3300 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 3301 .addReg(SrcRegX, RegState::Undef) 3302 .addImm(0) 3303 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 3304 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3305 } else { 3306 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 3307 .addReg(SrcReg, getKillRegState(KillSrc)) 3308 .addImm(0) 3309 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3310 } 3311 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 3312 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 3313 .addImm(0) 3314 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3315 } else { 3316 if (Subtarget.hasZeroCycleRegMove()) { 3317 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 3318 MCRegister DestRegX = TRI->getMatchingSuperReg( 3319 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3320 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3321 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3322 // This instruction is reading and writing X registers. This may upset 3323 // the register scavenger and machine verifier, so we need to indicate 3324 // that we are reading an undefined value from SrcRegX, but a proper 3325 // value from SrcReg. 3326 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 3327 .addReg(AArch64::XZR) 3328 .addReg(SrcRegX, RegState::Undef) 3329 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3330 } else { 3331 // Otherwise, expand to ORR WZR. 3332 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 3333 .addReg(AArch64::WZR) 3334 .addReg(SrcReg, getKillRegState(KillSrc)); 3335 } 3336 } 3337 return; 3338 } 3339 3340 // Copy a Predicate register by ORRing with itself. 
3341 if (AArch64::PPRRegClass.contains(DestReg) && 3342 AArch64::PPRRegClass.contains(SrcReg)) { 3343 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 3344 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 3345 .addReg(SrcReg) // Pg 3346 .addReg(SrcReg) 3347 .addReg(SrcReg, getKillRegState(KillSrc)); 3348 return; 3349 } 3350 3351 // Copy a Z register by ORRing with itself. 3352 if (AArch64::ZPRRegClass.contains(DestReg) && 3353 AArch64::ZPRRegClass.contains(SrcReg)) { 3354 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 3355 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 3356 .addReg(SrcReg) 3357 .addReg(SrcReg, getKillRegState(KillSrc)); 3358 return; 3359 } 3360 3361 // Copy a Z register pair by copying the individual sub-registers. 3362 if (AArch64::ZPR2RegClass.contains(DestReg) && 3363 AArch64::ZPR2RegClass.contains(SrcReg)) { 3364 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 3365 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3366 Indices); 3367 return; 3368 } 3369 3370 // Copy a Z register triple by copying the individual sub-registers. 3371 if (AArch64::ZPR3RegClass.contains(DestReg) && 3372 AArch64::ZPR3RegClass.contains(SrcReg)) { 3373 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3374 AArch64::zsub2}; 3375 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3376 Indices); 3377 return; 3378 } 3379 3380 // Copy a Z register quad by copying the individual sub-registers. 
3381 if (AArch64::ZPR4RegClass.contains(DestReg) && 3382 AArch64::ZPR4RegClass.contains(SrcReg)) { 3383 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3384 AArch64::zsub2, AArch64::zsub3}; 3385 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3386 Indices); 3387 return; 3388 } 3389 3390 if (AArch64::GPR64spRegClass.contains(DestReg) && 3391 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3392 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3393 // If either operand is SP, expand to ADD #0. 3394 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3395 .addReg(SrcReg, getKillRegState(KillSrc)) 3396 .addImm(0) 3397 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3398 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3399 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3400 .addImm(0) 3401 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3402 } else { 3403 // Otherwise, expand to ORR XZR. 3404 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3405 .addReg(AArch64::XZR) 3406 .addReg(SrcReg, getKillRegState(KillSrc)); 3407 } 3408 return; 3409 } 3410 3411 // Copy a DDDD register quad by copying the individual sub-registers. 3412 if (AArch64::DDDDRegClass.contains(DestReg) && 3413 AArch64::DDDDRegClass.contains(SrcReg)) { 3414 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3415 AArch64::dsub2, AArch64::dsub3}; 3416 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3417 Indices); 3418 return; 3419 } 3420 3421 // Copy a DDD register triple by copying the individual sub-registers. 
3422 if (AArch64::DDDRegClass.contains(DestReg) && 3423 AArch64::DDDRegClass.contains(SrcReg)) { 3424 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3425 AArch64::dsub2}; 3426 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3427 Indices); 3428 return; 3429 } 3430 3431 // Copy a DD register pair by copying the individual sub-registers. 3432 if (AArch64::DDRegClass.contains(DestReg) && 3433 AArch64::DDRegClass.contains(SrcReg)) { 3434 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3435 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3436 Indices); 3437 return; 3438 } 3439 3440 // Copy a QQQQ register quad by copying the individual sub-registers. 3441 if (AArch64::QQQQRegClass.contains(DestReg) && 3442 AArch64::QQQQRegClass.contains(SrcReg)) { 3443 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3444 AArch64::qsub2, AArch64::qsub3}; 3445 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3446 Indices); 3447 return; 3448 } 3449 3450 // Copy a QQQ register triple by copying the individual sub-registers. 3451 if (AArch64::QQQRegClass.contains(DestReg) && 3452 AArch64::QQQRegClass.contains(SrcReg)) { 3453 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3454 AArch64::qsub2}; 3455 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3456 Indices); 3457 return; 3458 } 3459 3460 // Copy a QQ register pair by copying the individual sub-registers. 
3461 if (AArch64::QQRegClass.contains(DestReg) && 3462 AArch64::QQRegClass.contains(SrcReg)) { 3463 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3464 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3465 Indices); 3466 return; 3467 } 3468 3469 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3470 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3471 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3472 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3473 AArch64::XZR, Indices); 3474 return; 3475 } 3476 3477 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3478 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3479 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3480 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3481 AArch64::WZR, Indices); 3482 return; 3483 } 3484 3485 if (AArch64::FPR128RegClass.contains(DestReg) && 3486 AArch64::FPR128RegClass.contains(SrcReg)) { 3487 if (Subtarget.hasNEON()) { 3488 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3489 .addReg(SrcReg) 3490 .addReg(SrcReg, getKillRegState(KillSrc)); 3491 } else { 3492 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3493 .addReg(AArch64::SP, RegState::Define) 3494 .addReg(SrcReg, getKillRegState(KillSrc)) 3495 .addReg(AArch64::SP) 3496 .addImm(-16); 3497 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3498 .addReg(AArch64::SP, RegState::Define) 3499 .addReg(DestReg, RegState::Define) 3500 .addReg(AArch64::SP) 3501 .addImm(16); 3502 } 3503 return; 3504 } 3505 3506 if (AArch64::FPR64RegClass.contains(DestReg) && 3507 AArch64::FPR64RegClass.contains(SrcReg)) { 3508 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3509 .addReg(SrcReg, getKillRegState(KillSrc)); 3510 return; 3511 } 3512 3513 if (AArch64::FPR32RegClass.contains(DestReg) && 3514 AArch64::FPR32RegClass.contains(SrcReg)) { 3515 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3516 
.addReg(SrcReg, getKillRegState(KillSrc)); 3517 return; 3518 } 3519 3520 if (AArch64::FPR16RegClass.contains(DestReg) && 3521 AArch64::FPR16RegClass.contains(SrcReg)) { 3522 DestReg = 3523 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 3524 SrcReg = 3525 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 3526 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3527 .addReg(SrcReg, getKillRegState(KillSrc)); 3528 return; 3529 } 3530 3531 if (AArch64::FPR8RegClass.contains(DestReg) && 3532 AArch64::FPR8RegClass.contains(SrcReg)) { 3533 DestReg = 3534 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 3535 SrcReg = 3536 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 3537 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3538 .addReg(SrcReg, getKillRegState(KillSrc)); 3539 return; 3540 } 3541 3542 // Copies between GPR64 and FPR64. 3543 if (AArch64::FPR64RegClass.contains(DestReg) && 3544 AArch64::GPR64RegClass.contains(SrcReg)) { 3545 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3546 .addReg(SrcReg, getKillRegState(KillSrc)); 3547 return; 3548 } 3549 if (AArch64::GPR64RegClass.contains(DestReg) && 3550 AArch64::FPR64RegClass.contains(SrcReg)) { 3551 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3552 .addReg(SrcReg, getKillRegState(KillSrc)); 3553 return; 3554 } 3555 // Copies between GPR32 and FPR32. 
3556 if (AArch64::FPR32RegClass.contains(DestReg) && 3557 AArch64::GPR32RegClass.contains(SrcReg)) { 3558 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3559 .addReg(SrcReg, getKillRegState(KillSrc)); 3560 return; 3561 } 3562 if (AArch64::GPR32RegClass.contains(DestReg) && 3563 AArch64::FPR32RegClass.contains(SrcReg)) { 3564 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3565 .addReg(SrcReg, getKillRegState(KillSrc)); 3566 return; 3567 } 3568 3569 if (DestReg == AArch64::NZCV) { 3570 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3571 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3572 .addImm(AArch64SysReg::NZCV) 3573 .addReg(SrcReg, getKillRegState(KillSrc)) 3574 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3575 return; 3576 } 3577 3578 if (SrcReg == AArch64::NZCV) { 3579 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3580 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3581 .addImm(AArch64SysReg::NZCV) 3582 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3583 return; 3584 } 3585 3586 #ifndef NDEBUG 3587 const TargetRegisterInfo &TRI = getRegisterInfo(); 3588 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 3589 << TRI.getRegAsmName(SrcReg) << "\n"; 3590 #endif 3591 llvm_unreachable("unimplemented reg-to-reg copy"); 3592 } 3593 3594 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3595 MachineBasicBlock &MBB, 3596 MachineBasicBlock::iterator InsertBefore, 3597 const MCInstrDesc &MCID, 3598 Register SrcReg, bool IsKill, 3599 unsigned SubIdx0, unsigned SubIdx1, int FI, 3600 MachineMemOperand *MMO) { 3601 Register SrcReg0 = SrcReg; 3602 Register SrcReg1 = SrcReg; 3603 if (Register::isPhysicalRegister(SrcReg)) { 3604 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3605 SubIdx0 = 0; 3606 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3607 SubIdx1 = 0; 3608 } 3609 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3610 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3611 
.addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 3612 .addFrameIndex(FI) 3613 .addImm(0) 3614 .addMemOperand(MMO); 3615 } 3616 3617 void AArch64InstrInfo::storeRegToStackSlot( 3618 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, 3619 bool isKill, int FI, const TargetRegisterClass *RC, 3620 const TargetRegisterInfo *TRI) const { 3621 MachineFunction &MF = *MBB.getParent(); 3622 MachineFrameInfo &MFI = MF.getFrameInfo(); 3623 3624 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3625 MachineMemOperand *MMO = 3626 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 3627 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3628 unsigned Opc = 0; 3629 bool Offset = true; 3630 unsigned StackID = TargetStackID::Default; 3631 switch (TRI->getSpillSize(*RC)) { 3632 case 1: 3633 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3634 Opc = AArch64::STRBui; 3635 break; 3636 case 2: 3637 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3638 Opc = AArch64::STRHui; 3639 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3640 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3641 Opc = AArch64::STR_PXI; 3642 StackID = TargetStackID::ScalableVector; 3643 } 3644 break; 3645 case 4: 3646 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3647 Opc = AArch64::STRWui; 3648 if (Register::isVirtualRegister(SrcReg)) 3649 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3650 else 3651 assert(SrcReg != AArch64::WSP); 3652 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3653 Opc = AArch64::STRSui; 3654 break; 3655 case 8: 3656 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3657 Opc = AArch64::STRXui; 3658 if (Register::isVirtualRegister(SrcReg)) 3659 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3660 else 3661 assert(SrcReg != AArch64::SP); 3662 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3663 Opc = AArch64::STRDui; 3664 } else if 
(AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3665 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3666 get(AArch64::STPWi), SrcReg, isKill, 3667 AArch64::sube32, AArch64::subo32, FI, MMO); 3668 return; 3669 } 3670 break; 3671 case 16: 3672 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3673 Opc = AArch64::STRQui; 3674 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3675 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3676 Opc = AArch64::ST1Twov1d; 3677 Offset = false; 3678 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3679 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3680 get(AArch64::STPXi), SrcReg, isKill, 3681 AArch64::sube64, AArch64::subo64, FI, MMO); 3682 return; 3683 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3684 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3685 Opc = AArch64::STR_ZXI; 3686 StackID = TargetStackID::ScalableVector; 3687 } 3688 break; 3689 case 24: 3690 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3691 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3692 Opc = AArch64::ST1Threev1d; 3693 Offset = false; 3694 } 3695 break; 3696 case 32: 3697 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3698 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3699 Opc = AArch64::ST1Fourv1d; 3700 Offset = false; 3701 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3702 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3703 Opc = AArch64::ST1Twov2d; 3704 Offset = false; 3705 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3706 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3707 Opc = AArch64::STR_ZZXI; 3708 StackID = TargetStackID::ScalableVector; 3709 } 3710 break; 3711 case 48: 3712 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3713 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3714 Opc = AArch64::ST1Threev2d; 3715 Offset = 
false; 3716 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3717 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3718 Opc = AArch64::STR_ZZZXI; 3719 StackID = TargetStackID::ScalableVector; 3720 } 3721 break; 3722 case 64: 3723 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3724 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3725 Opc = AArch64::ST1Fourv2d; 3726 Offset = false; 3727 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3728 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3729 Opc = AArch64::STR_ZZZZXI; 3730 StackID = TargetStackID::ScalableVector; 3731 } 3732 break; 3733 } 3734 assert(Opc && "Unknown register class"); 3735 MFI.setStackID(FI, StackID); 3736 3737 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3738 .addReg(SrcReg, getKillRegState(isKill)) 3739 .addFrameIndex(FI); 3740 3741 if (Offset) 3742 MI.addImm(0); 3743 MI.addMemOperand(MMO); 3744 } 3745 3746 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3747 MachineBasicBlock &MBB, 3748 MachineBasicBlock::iterator InsertBefore, 3749 const MCInstrDesc &MCID, 3750 Register DestReg, unsigned SubIdx0, 3751 unsigned SubIdx1, int FI, 3752 MachineMemOperand *MMO) { 3753 Register DestReg0 = DestReg; 3754 Register DestReg1 = DestReg; 3755 bool IsUndef = true; 3756 if (Register::isPhysicalRegister(DestReg)) { 3757 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 3758 SubIdx0 = 0; 3759 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 3760 SubIdx1 = 0; 3761 IsUndef = false; 3762 } 3763 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3764 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 3765 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 3766 .addFrameIndex(FI) 3767 .addImm(0) 3768 .addMemOperand(MMO); 3769 } 3770 3771 void AArch64InstrInfo::loadRegFromStackSlot( 3772 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, 3773 
int FI, const TargetRegisterClass *RC, 3774 const TargetRegisterInfo *TRI) const { 3775 MachineFunction &MF = *MBB.getParent(); 3776 MachineFrameInfo &MFI = MF.getFrameInfo(); 3777 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3778 MachineMemOperand *MMO = 3779 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 3780 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3781 3782 unsigned Opc = 0; 3783 bool Offset = true; 3784 unsigned StackID = TargetStackID::Default; 3785 switch (TRI->getSpillSize(*RC)) { 3786 case 1: 3787 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3788 Opc = AArch64::LDRBui; 3789 break; 3790 case 2: 3791 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3792 Opc = AArch64::LDRHui; 3793 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3794 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3795 Opc = AArch64::LDR_PXI; 3796 StackID = TargetStackID::ScalableVector; 3797 } 3798 break; 3799 case 4: 3800 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3801 Opc = AArch64::LDRWui; 3802 if (Register::isVirtualRegister(DestReg)) 3803 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 3804 else 3805 assert(DestReg != AArch64::WSP); 3806 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3807 Opc = AArch64::LDRSui; 3808 break; 3809 case 8: 3810 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3811 Opc = AArch64::LDRXui; 3812 if (Register::isVirtualRegister(DestReg)) 3813 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 3814 else 3815 assert(DestReg != AArch64::SP); 3816 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3817 Opc = AArch64::LDRDui; 3818 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3819 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3820 get(AArch64::LDPWi), DestReg, AArch64::sube32, 3821 AArch64::subo32, FI, MMO); 3822 return; 3823 } 3824 break; 3825 case 16: 3826 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3827 Opc = 
AArch64::LDRQui; 3828 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3829 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3830 Opc = AArch64::LD1Twov1d; 3831 Offset = false; 3832 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3833 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3834 get(AArch64::LDPXi), DestReg, AArch64::sube64, 3835 AArch64::subo64, FI, MMO); 3836 return; 3837 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3838 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3839 Opc = AArch64::LDR_ZXI; 3840 StackID = TargetStackID::ScalableVector; 3841 } 3842 break; 3843 case 24: 3844 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3845 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3846 Opc = AArch64::LD1Threev1d; 3847 Offset = false; 3848 } 3849 break; 3850 case 32: 3851 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3852 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3853 Opc = AArch64::LD1Fourv1d; 3854 Offset = false; 3855 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3856 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3857 Opc = AArch64::LD1Twov2d; 3858 Offset = false; 3859 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3860 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3861 Opc = AArch64::LDR_ZZXI; 3862 StackID = TargetStackID::ScalableVector; 3863 } 3864 break; 3865 case 48: 3866 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3867 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3868 Opc = AArch64::LD1Threev2d; 3869 Offset = false; 3870 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3871 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3872 Opc = AArch64::LDR_ZZZXI; 3873 StackID = TargetStackID::ScalableVector; 3874 } 3875 break; 3876 case 64: 3877 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3878 
assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3879 Opc = AArch64::LD1Fourv2d; 3880 Offset = false; 3881 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3882 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3883 Opc = AArch64::LDR_ZZZZXI; 3884 StackID = TargetStackID::ScalableVector; 3885 } 3886 break; 3887 } 3888 3889 assert(Opc && "Unknown register class"); 3890 MFI.setStackID(FI, StackID); 3891 3892 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3893 .addReg(DestReg, getDefRegState(true)) 3894 .addFrameIndex(FI); 3895 if (Offset) 3896 MI.addImm(0); 3897 MI.addMemOperand(MMO); 3898 } 3899 3900 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 3901 const MachineInstr &UseMI, 3902 const TargetRegisterInfo *TRI) { 3903 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 3904 UseMI.getIterator()), 3905 [TRI](const MachineInstr &I) { 3906 return I.modifiesRegister(AArch64::NZCV, TRI) || 3907 I.readsRegister(AArch64::NZCV, TRI); 3908 }); 3909 } 3910 3911 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 3912 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 3913 // The smallest scalable element supported by scaled SVE addressing 3914 // modes are predicates, which are 2 scalable bytes in size. So the scalable 3915 // byte offset must always be a multiple of 2. 3916 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 3917 3918 // VGSized offsets are divided by '2', because the VG register is the 3919 // the number of 64bit granules as opposed to 128bit vector chunks, 3920 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 3921 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 3922 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 
3923 ByteSized = Offset.getFixed(); 3924 VGSized = Offset.getScalable() / 2; 3925 } 3926 3927 /// Returns the offset in parts to which this frame offset can be 3928 /// decomposed for the purpose of describing a frame offset. 3929 /// For non-scalable offsets this is simply its byte size. 3930 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 3931 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 3932 int64_t &NumDataVectors) { 3933 // The smallest scalable element supported by scaled SVE addressing 3934 // modes are predicates, which are 2 scalable bytes in size. So the scalable 3935 // byte offset must always be a multiple of 2. 3936 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 3937 3938 NumBytes = Offset.getFixed(); 3939 NumDataVectors = 0; 3940 NumPredicateVectors = Offset.getScalable() / 2; 3941 // This method is used to get the offsets to adjust the frame offset. 3942 // If the function requires ADDPL to be used and needs more than two ADDPL 3943 // instructions, part of the offset is folded into NumDataVectors so that it 3944 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 3945 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 3946 NumPredicateVectors > 62) { 3947 NumDataVectors = NumPredicateVectors / 8; 3948 NumPredicateVectors -= NumDataVectors * 8; 3949 } 3950 } 3951 3952 // Helper function to emit a frame offset adjustment from a given 3953 // pointer (SrcReg), stored into DestReg. This function is explicit 3954 // in that it requires the opcode. 
3955 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 3956 MachineBasicBlock::iterator MBBI, 3957 const DebugLoc &DL, unsigned DestReg, 3958 unsigned SrcReg, int64_t Offset, unsigned Opc, 3959 const TargetInstrInfo *TII, 3960 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 3961 bool *HasWinCFI) { 3962 int Sign = 1; 3963 unsigned MaxEncoding, ShiftSize; 3964 switch (Opc) { 3965 case AArch64::ADDXri: 3966 case AArch64::ADDSXri: 3967 case AArch64::SUBXri: 3968 case AArch64::SUBSXri: 3969 MaxEncoding = 0xfff; 3970 ShiftSize = 12; 3971 break; 3972 case AArch64::ADDVL_XXI: 3973 case AArch64::ADDPL_XXI: 3974 MaxEncoding = 31; 3975 ShiftSize = 0; 3976 if (Offset < 0) { 3977 MaxEncoding = 32; 3978 Sign = -1; 3979 Offset = -Offset; 3980 } 3981 break; 3982 default: 3983 llvm_unreachable("Unsupported opcode"); 3984 } 3985 3986 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 3987 // scratch register. If DestReg is a virtual register, use it as the 3988 // scratch register; otherwise, create a new virtual register (to be 3989 // replaced by the scavenger at the end of PEI). That case can be optimized 3990 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 3991 // register can be loaded with offset%8 and the add/sub can use an extending 3992 // instruction with LSL#3. 3993 // Currently the function handles any offsets but generates a poor sequence 3994 // of code. 
3995 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 3996 3997 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 3998 Register TmpReg = DestReg; 3999 if (TmpReg == AArch64::XZR) 4000 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 4001 &AArch64::GPR64RegClass); 4002 do { 4003 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 4004 unsigned LocalShiftSize = 0; 4005 if (ThisVal > MaxEncoding) { 4006 ThisVal = ThisVal >> ShiftSize; 4007 LocalShiftSize = ShiftSize; 4008 } 4009 assert((ThisVal >> ShiftSize) <= MaxEncoding && 4010 "Encoding cannot handle value that big"); 4011 4012 Offset -= ThisVal << LocalShiftSize; 4013 if (Offset == 0) 4014 TmpReg = DestReg; 4015 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 4016 .addReg(SrcReg) 4017 .addImm(Sign * (int)ThisVal); 4018 if (ShiftSize) 4019 MBI = MBI.addImm( 4020 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 4021 MBI = MBI.setMIFlag(Flag); 4022 4023 if (NeedsWinCFI) { 4024 assert(Sign == 1 && "SEH directives should always have a positive sign"); 4025 int Imm = (int)(ThisVal << LocalShiftSize); 4026 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 4027 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 4028 if (HasWinCFI) 4029 *HasWinCFI = true; 4030 if (Imm == 0) 4031 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 4032 else 4033 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 4034 .addImm(Imm) 4035 .setMIFlag(Flag); 4036 assert(Offset == 0 && "Expected remaining offset to be zero to " 4037 "emit a single SEH directive"); 4038 } else if (DestReg == AArch64::SP) { 4039 if (HasWinCFI) 4040 *HasWinCFI = true; 4041 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 4042 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 4043 .addImm(Imm) 4044 .setMIFlag(Flag); 4045 } 4046 if (HasWinCFI) 4047 *HasWinCFI = true; 4048 } 4049 4050 SrcReg = TmpReg; 4051 } while (Offset); 4052 } 
4053 4054 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 4055 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 4056 unsigned DestReg, unsigned SrcReg, 4057 StackOffset Offset, const TargetInstrInfo *TII, 4058 MachineInstr::MIFlag Flag, bool SetNZCV, 4059 bool NeedsWinCFI, bool *HasWinCFI) { 4060 int64_t Bytes, NumPredicateVectors, NumDataVectors; 4061 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4062 Offset, Bytes, NumPredicateVectors, NumDataVectors); 4063 4064 // First emit non-scalable frame offsets, or a simple 'mov'. 4065 if (Bytes || (!Offset && SrcReg != DestReg)) { 4066 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 4067 "SP increment/decrement not 8-byte aligned"); 4068 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 4069 if (Bytes < 0) { 4070 Bytes = -Bytes; 4071 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 4072 } 4073 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 4074 NeedsWinCFI, HasWinCFI); 4075 SrcReg = DestReg; 4076 } 4077 4078 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 4079 "SetNZCV not supported with SVE vectors"); 4080 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 4081 "WinCFI not supported with SVE vectors"); 4082 4083 if (NumDataVectors) { 4084 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 4085 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 4086 SrcReg = DestReg; 4087 } 4088 4089 if (NumPredicateVectors) { 4090 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 4091 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 4092 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 4093 } 4094 } 4095 4096 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 4097 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 4098 MachineBasicBlock::iterator InsertPt, int FrameIndex, 4099 LiveIntervals *LIS, VirtRegMap *VRM) const { 4100 // This is a bit of a hack. 
Consider this instruction: 4101 // 4102 // %0 = COPY %sp; GPR64all:%0 4103 // 4104 // We explicitly chose GPR64all for the virtual register so such a copy might 4105 // be eliminated by RegisterCoalescer. However, that may not be possible, and 4106 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 4107 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 4108 // 4109 // To prevent that, we are going to constrain the %0 register class here. 4110 // 4111 // <rdar://problem/11522048> 4112 // 4113 if (MI.isFullCopy()) { 4114 Register DstReg = MI.getOperand(0).getReg(); 4115 Register SrcReg = MI.getOperand(1).getReg(); 4116 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { 4117 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 4118 return nullptr; 4119 } 4120 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { 4121 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4122 return nullptr; 4123 } 4124 } 4125 4126 // Handle the case where a copy is being spilled or filled but the source 4127 // and destination register class don't match. For example: 4128 // 4129 // %0 = COPY %xzr; GPR64common:%0 4130 // 4131 // In this case we can still safely fold away the COPY and generate the 4132 // following spill code: 4133 // 4134 // STRXui %xzr, %stack.0 4135 // 4136 // This also eliminates spilled cross register class COPYs (e.g. between x and 4137 // d regs) of the same size. For example: 4138 // 4139 // %0 = COPY %1; GPR64:%0, FPR64:%1 4140 // 4141 // will be filled as 4142 // 4143 // LDRDui %0, fi<#0> 4144 // 4145 // instead of 4146 // 4147 // LDRXui %Temp, fi<#0> 4148 // %0 = FMOV %Temp 4149 // 4150 if (MI.isCopy() && Ops.size() == 1 && 4151 // Make sure we're only folding the explicit COPY defs/uses. 
4152 (Ops[0] == 0 || Ops[0] == 1)) { 4153 bool IsSpill = Ops[0] == 0; 4154 bool IsFill = !IsSpill; 4155 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 4156 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4157 MachineBasicBlock &MBB = *MI.getParent(); 4158 const MachineOperand &DstMO = MI.getOperand(0); 4159 const MachineOperand &SrcMO = MI.getOperand(1); 4160 Register DstReg = DstMO.getReg(); 4161 Register SrcReg = SrcMO.getReg(); 4162 // This is slightly expensive to compute for physical regs since 4163 // getMinimalPhysRegClass is slow. 4164 auto getRegClass = [&](unsigned Reg) { 4165 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 4166 : TRI.getMinimalPhysRegClass(Reg); 4167 }; 4168 4169 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 4170 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 4171 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 4172 "Mismatched register size in non subreg COPY"); 4173 if (IsSpill) 4174 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 4175 getRegClass(SrcReg), &TRI); 4176 else 4177 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 4178 getRegClass(DstReg), &TRI); 4179 return &*--InsertPt; 4180 } 4181 4182 // Handle cases like spilling def of: 4183 // 4184 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 4185 // 4186 // where the physical register source can be widened and stored to the full 4187 // virtual reg destination stack slot, in this case producing: 4188 // 4189 // STRXui %xzr, %stack.0 4190 // 4191 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { 4192 assert(SrcMO.getSubReg() == 0 && 4193 "Unexpected subreg on physical register"); 4194 const TargetRegisterClass *SpillRC; 4195 unsigned SpillSubreg; 4196 switch (DstMO.getSubReg()) { 4197 default: 4198 SpillRC = nullptr; 4199 break; 4200 case AArch64::sub_32: 4201 case AArch64::ssub: 4202 if (AArch64::GPR32RegClass.contains(SrcReg)) { 4203 SpillRC = 
&AArch64::GPR64RegClass; 4204 SpillSubreg = AArch64::sub_32; 4205 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 4206 SpillRC = &AArch64::FPR64RegClass; 4207 SpillSubreg = AArch64::ssub; 4208 } else 4209 SpillRC = nullptr; 4210 break; 4211 case AArch64::dsub: 4212 if (AArch64::FPR64RegClass.contains(SrcReg)) { 4213 SpillRC = &AArch64::FPR128RegClass; 4214 SpillSubreg = AArch64::dsub; 4215 } else 4216 SpillRC = nullptr; 4217 break; 4218 } 4219 4220 if (SpillRC) 4221 if (unsigned WidenedSrcReg = 4222 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 4223 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 4224 FrameIndex, SpillRC, &TRI); 4225 return &*--InsertPt; 4226 } 4227 } 4228 4229 // Handle cases like filling use of: 4230 // 4231 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 4232 // 4233 // where we can load the full virtual reg source stack slot, into the subreg 4234 // destination, in this case producing: 4235 // 4236 // LDRWui %0:sub_32<def,read-undef>, %stack.0 4237 // 4238 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 4239 const TargetRegisterClass *FillRC; 4240 switch (DstMO.getSubReg()) { 4241 default: 4242 FillRC = nullptr; 4243 break; 4244 case AArch64::sub_32: 4245 FillRC = &AArch64::GPR32RegClass; 4246 break; 4247 case AArch64::ssub: 4248 FillRC = &AArch64::FPR32RegClass; 4249 break; 4250 case AArch64::dsub: 4251 FillRC = &AArch64::FPR64RegClass; 4252 break; 4253 } 4254 4255 if (FillRC) { 4256 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 4257 TRI.getRegSizeInBits(*FillRC) && 4258 "Mismatched regclass size on folded subreg COPY"); 4259 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); 4260 MachineInstr &LoadMI = *--InsertPt; 4261 MachineOperand &LoadDst = LoadMI.getOperand(0); 4262 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 4263 LoadDst.setSubReg(DstMO.getSubReg()); 4264 LoadDst.setIsUndef(); 4265 return &LoadMI; 4266 } 4267 } 4268 } 4269 4270 
// Cannot fold. 4271 return nullptr; 4272 } 4273 4274 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 4275 StackOffset &SOffset, 4276 bool *OutUseUnscaledOp, 4277 unsigned *OutUnscaledOp, 4278 int64_t *EmittableOffset) { 4279 // Set output values in case of early exit. 4280 if (EmittableOffset) 4281 *EmittableOffset = 0; 4282 if (OutUseUnscaledOp) 4283 *OutUseUnscaledOp = false; 4284 if (OutUnscaledOp) 4285 *OutUnscaledOp = 0; 4286 4287 // Exit early for structured vector spills/fills as they can't take an 4288 // immediate offset. 4289 switch (MI.getOpcode()) { 4290 default: 4291 break; 4292 case AArch64::LD1Twov2d: 4293 case AArch64::LD1Threev2d: 4294 case AArch64::LD1Fourv2d: 4295 case AArch64::LD1Twov1d: 4296 case AArch64::LD1Threev1d: 4297 case AArch64::LD1Fourv1d: 4298 case AArch64::ST1Twov2d: 4299 case AArch64::ST1Threev2d: 4300 case AArch64::ST1Fourv2d: 4301 case AArch64::ST1Twov1d: 4302 case AArch64::ST1Threev1d: 4303 case AArch64::ST1Fourv1d: 4304 case AArch64::IRG: 4305 case AArch64::IRGstack: 4306 case AArch64::STGloop: 4307 case AArch64::STZGloop: 4308 return AArch64FrameOffsetCannotUpdate; 4309 } 4310 4311 // Get the min/max offset and the scale. 4312 TypeSize ScaleValue(0U, false); 4313 unsigned Width; 4314 int64_t MinOff, MaxOff; 4315 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 4316 MaxOff)) 4317 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4318 4319 // Construct the complete offset. 4320 bool IsMulVL = ScaleValue.isScalable(); 4321 unsigned Scale = ScaleValue.getKnownMinSize(); 4322 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 4323 4324 const MachineOperand &ImmOpnd = 4325 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 4326 Offset += ImmOpnd.getImm() * Scale; 4327 4328 // If the offset doesn't match the scale, we rewrite the instruction to 4329 // use the unscaled instruction instead. 
Likewise, if we have a negative 4330 // offset and there is an unscaled op to use. 4331 Optional<unsigned> UnscaledOp = 4332 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 4333 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 4334 if (useUnscaledOp && 4335 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 4336 MaxOff)) 4337 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4338 4339 Scale = ScaleValue.getKnownMinSize(); 4340 assert(IsMulVL == ScaleValue.isScalable() && 4341 "Unscaled opcode has different value for scalable"); 4342 4343 int64_t Remainder = Offset % Scale; 4344 assert(!(Remainder && useUnscaledOp) && 4345 "Cannot have remainder when using unscaled op"); 4346 4347 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 4348 int64_t NewOffset = Offset / Scale; 4349 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4350 Offset = Remainder; 4351 else { 4352 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 4353 Offset = Offset - NewOffset * Scale + Remainder; 4354 } 4355 4356 if (EmittableOffset) 4357 *EmittableOffset = NewOffset; 4358 if (OutUseUnscaledOp) 4359 *OutUseUnscaledOp = useUnscaledOp; 4360 if (OutUnscaledOp && UnscaledOp) 4361 *OutUnscaledOp = *UnscaledOp; 4362 4363 if (IsMulVL) 4364 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4365 else 4366 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4367 return AArch64FrameOffsetCanUpdate | 4368 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 4369 } 4370 4371 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4372 unsigned FrameReg, StackOffset &Offset, 4373 const AArch64InstrInfo *TII) { 4374 unsigned Opcode = MI.getOpcode(); 4375 unsigned ImmIdx = FrameRegIdx + 1; 4376 4377 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4378 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4379 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4380 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4381 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4382 MI.eraseFromParent(); 4383 Offset = StackOffset(); 4384 return true; 4385 } 4386 4387 int64_t NewOffset; 4388 unsigned UnscaledOp; 4389 bool UseUnscaledOp; 4390 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4391 &UnscaledOp, &NewOffset); 4392 if (Status & AArch64FrameOffsetCanUpdate) { 4393 if (Status & AArch64FrameOffsetIsLegal) 4394 // Replace the FrameIndex with FrameReg. 4395 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4396 if (UseUnscaledOp) 4397 MI.setDesc(TII->get(UnscaledOp)); 4398 4399 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4400 return !Offset; 4401 } 4402 4403 return false; 4404 } 4405 4406 MCInst AArch64InstrInfo::getNop() const { 4407 return MCInstBuilder(AArch64::HINT).addImm(0); 4408 } 4409 4410 // AArch64 supports MachineCombiner. 4411 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4412 4413 // True when Opc sets flag 4414 static bool isCombineInstrSettingFlag(unsigned Opc) { 4415 switch (Opc) { 4416 case AArch64::ADDSWrr: 4417 case AArch64::ADDSWri: 4418 case AArch64::ADDSXrr: 4419 case AArch64::ADDSXri: 4420 case AArch64::SUBSWrr: 4421 case AArch64::SUBSXrr: 4422 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
4423 case AArch64::SUBSWri: 4424 case AArch64::SUBSXri: 4425 return true; 4426 default: 4427 break; 4428 } 4429 return false; 4430 } 4431 4432 // 32b Opcodes that can be combined with a MUL 4433 static bool isCombineInstrCandidate32(unsigned Opc) { 4434 switch (Opc) { 4435 case AArch64::ADDWrr: 4436 case AArch64::ADDWri: 4437 case AArch64::SUBWrr: 4438 case AArch64::ADDSWrr: 4439 case AArch64::ADDSWri: 4440 case AArch64::SUBSWrr: 4441 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4442 case AArch64::SUBWri: 4443 case AArch64::SUBSWri: 4444 return true; 4445 default: 4446 break; 4447 } 4448 return false; 4449 } 4450 4451 // 64b Opcodes that can be combined with a MUL 4452 static bool isCombineInstrCandidate64(unsigned Opc) { 4453 switch (Opc) { 4454 case AArch64::ADDXrr: 4455 case AArch64::ADDXri: 4456 case AArch64::SUBXrr: 4457 case AArch64::ADDSXrr: 4458 case AArch64::ADDSXri: 4459 case AArch64::SUBSXrr: 4460 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4461 case AArch64::SUBXri: 4462 case AArch64::SUBSXri: 4463 case AArch64::ADDv8i8: 4464 case AArch64::ADDv16i8: 4465 case AArch64::ADDv4i16: 4466 case AArch64::ADDv8i16: 4467 case AArch64::ADDv2i32: 4468 case AArch64::ADDv4i32: 4469 case AArch64::SUBv8i8: 4470 case AArch64::SUBv16i8: 4471 case AArch64::SUBv4i16: 4472 case AArch64::SUBv8i16: 4473 case AArch64::SUBv2i32: 4474 case AArch64::SUBv4i32: 4475 return true; 4476 default: 4477 break; 4478 } 4479 return false; 4480 } 4481 4482 // FP Opcodes that can be combined with a FMUL. 
4483 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 4484 switch (Inst.getOpcode()) { 4485 default: 4486 break; 4487 case AArch64::FADDHrr: 4488 case AArch64::FADDSrr: 4489 case AArch64::FADDDrr: 4490 case AArch64::FADDv4f16: 4491 case AArch64::FADDv8f16: 4492 case AArch64::FADDv2f32: 4493 case AArch64::FADDv2f64: 4494 case AArch64::FADDv4f32: 4495 case AArch64::FSUBHrr: 4496 case AArch64::FSUBSrr: 4497 case AArch64::FSUBDrr: 4498 case AArch64::FSUBv4f16: 4499 case AArch64::FSUBv8f16: 4500 case AArch64::FSUBv2f32: 4501 case AArch64::FSUBv2f64: 4502 case AArch64::FSUBv4f32: 4503 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 4504 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by 4505 // the target options or if FADD/FSUB has the contract fast-math flag. 4506 return Options.UnsafeFPMath || 4507 Options.AllowFPOpFusion == FPOpFusion::Fast || 4508 Inst.getFlag(MachineInstr::FmContract); 4509 return true; 4510 } 4511 return false; 4512 } 4513 4514 // Opcodes that can be combined with a MUL 4515 static bool isCombineInstrCandidate(unsigned Opc) { 4516 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 4517 } 4518 4519 // 4520 // Utility routine that checks if \param MO is defined by an 4521 // \param CombineOpc instruction in the basic block \param MBB 4522 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 4523 unsigned CombineOpc, unsigned ZeroReg = 0, 4524 bool CheckZeroReg = false) { 4525 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4526 MachineInstr *MI = nullptr; 4527 4528 if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) 4529 MI = MRI.getUniqueVRegDef(MO.getReg()); 4530 // And it needs to be in the trace (otherwise, it won't have a depth). 4531 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 4532 return false; 4533 // Must only used by the user we combine with. 
4534 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 4535 return false; 4536 4537 if (CheckZeroReg) { 4538 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 4539 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 4540 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); 4541 // The third input reg must be zero. 4542 if (MI->getOperand(3).getReg() != ZeroReg) 4543 return false; 4544 } 4545 4546 return true; 4547 } 4548 4549 // 4550 // Is \param MO defined by an integer multiply and can be combined? 4551 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4552 unsigned MulOpc, unsigned ZeroReg) { 4553 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 4554 } 4555 4556 // 4557 // Is \param MO defined by a floating-point multiply and can be combined? 4558 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4559 unsigned MulOpc) { 4560 return canCombine(MBB, MO, MulOpc); 4561 } 4562 4563 // TODO: There are many more machine instruction opcodes to match: 4564 // 1. Other data types (integer, vectors) 4565 // 2. Other math / logic operations (xor, or) 4566 // 3. Other forms of the same operation (intrinsics and other variants) 4567 bool AArch64InstrInfo::isAssociativeAndCommutative( 4568 const MachineInstr &Inst) const { 4569 switch (Inst.getOpcode()) { 4570 case AArch64::FADDDrr: 4571 case AArch64::FADDSrr: 4572 case AArch64::FADDv2f32: 4573 case AArch64::FADDv2f64: 4574 case AArch64::FADDv4f32: 4575 case AArch64::FMULDrr: 4576 case AArch64::FMULSrr: 4577 case AArch64::FMULX32: 4578 case AArch64::FMULX64: 4579 case AArch64::FMULXv2f32: 4580 case AArch64::FMULXv2f64: 4581 case AArch64::FMULXv4f32: 4582 case AArch64::FMULv2f32: 4583 case AArch64::FMULv2f64: 4584 case AArch64::FMULv4f32: 4585 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; 4586 default: 4587 return false; 4588 } 4589 } 4590 4591 /// Find instructions that can be turned into madd. 
4592 static bool getMaddPatterns(MachineInstr &Root, 4593 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4594 unsigned Opc = Root.getOpcode(); 4595 MachineBasicBlock &MBB = *Root.getParent(); 4596 bool Found = false; 4597 4598 if (!isCombineInstrCandidate(Opc)) 4599 return false; 4600 if (isCombineInstrSettingFlag(Opc)) { 4601 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 4602 // When NZCV is live bail out. 4603 if (Cmp_NZCV == -1) 4604 return false; 4605 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 4606 // When opcode can't change bail out. 4607 // CHECKME: do we miss any cases for opcode conversion? 4608 if (NewOpc == Opc) 4609 return false; 4610 Opc = NewOpc; 4611 } 4612 4613 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 4614 MachineCombinerPattern Pattern) { 4615 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 4616 Patterns.push_back(Pattern); 4617 Found = true; 4618 } 4619 }; 4620 4621 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 4622 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 4623 Patterns.push_back(Pattern); 4624 Found = true; 4625 } 4626 }; 4627 4628 typedef MachineCombinerPattern MCP; 4629 4630 switch (Opc) { 4631 default: 4632 break; 4633 case AArch64::ADDWrr: 4634 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4635 "ADDWrr does not have register operands"); 4636 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 4637 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 4638 break; 4639 case AArch64::ADDXrr: 4640 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 4641 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 4642 break; 4643 case AArch64::SUBWrr: 4644 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 4645 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 4646 break; 4647 case AArch64::SUBXrr: 4648 setFound(AArch64::MADDXrrr, 
1, AArch64::XZR, MCP::MULSUBX_OP1); 4649 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 4650 break; 4651 case AArch64::ADDWri: 4652 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 4653 break; 4654 case AArch64::ADDXri: 4655 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 4656 break; 4657 case AArch64::SUBWri: 4658 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 4659 break; 4660 case AArch64::SUBXri: 4661 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 4662 break; 4663 case AArch64::ADDv8i8: 4664 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 4665 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 4666 break; 4667 case AArch64::ADDv16i8: 4668 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 4669 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 4670 break; 4671 case AArch64::ADDv4i16: 4672 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 4673 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 4674 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 4675 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 4676 break; 4677 case AArch64::ADDv8i16: 4678 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 4679 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 4680 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 4681 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 4682 break; 4683 case AArch64::ADDv2i32: 4684 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 4685 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 4686 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 4687 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 4688 break; 4689 case AArch64::ADDv4i32: 4690 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 4691 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 4692 setVFound(AArch64::MULv4i32_indexed, 
1, MCP::MULADDv4i32_indexed_OP1); 4693 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 4694 break; 4695 case AArch64::SUBv8i8: 4696 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 4697 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 4698 break; 4699 case AArch64::SUBv16i8: 4700 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 4701 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 4702 break; 4703 case AArch64::SUBv4i16: 4704 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 4705 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 4706 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 4707 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 4708 break; 4709 case AArch64::SUBv8i16: 4710 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 4711 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 4712 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 4713 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 4714 break; 4715 case AArch64::SUBv2i32: 4716 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 4717 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 4718 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 4719 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 4720 break; 4721 case AArch64::SUBv4i32: 4722 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 4723 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 4724 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 4725 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 4726 break; 4727 } 4728 return Found; 4729 } 4730 /// Floating-Point Support 4731 4732 /// Find instructions that can be turned into madd. 
4733 static bool getFMAPatterns(MachineInstr &Root, 4734 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4735 4736 if (!isCombineInstrCandidateFP(Root)) 4737 return false; 4738 4739 MachineBasicBlock &MBB = *Root.getParent(); 4740 bool Found = false; 4741 4742 auto Match = [&](int Opcode, int Operand, 4743 MachineCombinerPattern Pattern) -> bool { 4744 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 4745 Patterns.push_back(Pattern); 4746 return true; 4747 } 4748 return false; 4749 }; 4750 4751 typedef MachineCombinerPattern MCP; 4752 4753 switch (Root.getOpcode()) { 4754 default: 4755 assert(false && "Unsupported FP instruction in combiner\n"); 4756 break; 4757 case AArch64::FADDHrr: 4758 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4759 "FADDHrr does not have register operands"); 4760 4761 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 4762 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 4763 break; 4764 case AArch64::FADDSrr: 4765 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4766 "FADDSrr does not have register operands"); 4767 4768 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 4769 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 4770 4771 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 4772 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 4773 break; 4774 case AArch64::FADDDrr: 4775 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 4776 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 4777 4778 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 4779 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 4780 break; 4781 case AArch64::FADDv4f16: 4782 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 4783 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 4784 4785 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 4786 
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 4787 break; 4788 case AArch64::FADDv8f16: 4789 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 4790 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 4791 4792 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 4793 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 4794 break; 4795 case AArch64::FADDv2f32: 4796 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 4797 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 4798 4799 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 4800 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 4801 break; 4802 case AArch64::FADDv2f64: 4803 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 4804 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 4805 4806 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 4807 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 4808 break; 4809 case AArch64::FADDv4f32: 4810 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 4811 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 4812 4813 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 4814 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 4815 break; 4816 case AArch64::FSUBHrr: 4817 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 4818 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 4819 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 4820 break; 4821 case AArch64::FSUBSrr: 4822 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 4823 4824 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 4825 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 4826 4827 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 4828 break; 4829 case AArch64::FSUBDrr: 4830 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 4831 4832 Found |= 
Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 4833 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 4834 4835 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 4836 break; 4837 case AArch64::FSUBv4f16: 4838 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 4839 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 4840 4841 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 4842 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 4843 break; 4844 case AArch64::FSUBv8f16: 4845 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 4846 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 4847 4848 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 4849 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 4850 break; 4851 case AArch64::FSUBv2f32: 4852 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 4853 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 4854 4855 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 4856 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 4857 break; 4858 case AArch64::FSUBv2f64: 4859 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 4860 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 4861 4862 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 4863 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 4864 break; 4865 case AArch64::FSUBv4f32: 4866 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 4867 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 4868 4869 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 4870 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 4871 break; 4872 } 4873 return Found; 4874 } 4875 4876 /// Return true when a code sequence can improve throughput. It 4877 /// should be called only for instructions in loops. 
4878 /// \param Pattern - combiner pattern 4879 bool AArch64InstrInfo::isThroughputPattern( 4880 MachineCombinerPattern Pattern) const { 4881 switch (Pattern) { 4882 default: 4883 break; 4884 case MachineCombinerPattern::FMULADDH_OP1: 4885 case MachineCombinerPattern::FMULADDH_OP2: 4886 case MachineCombinerPattern::FMULSUBH_OP1: 4887 case MachineCombinerPattern::FMULSUBH_OP2: 4888 case MachineCombinerPattern::FMULADDS_OP1: 4889 case MachineCombinerPattern::FMULADDS_OP2: 4890 case MachineCombinerPattern::FMULSUBS_OP1: 4891 case MachineCombinerPattern::FMULSUBS_OP2: 4892 case MachineCombinerPattern::FMULADDD_OP1: 4893 case MachineCombinerPattern::FMULADDD_OP2: 4894 case MachineCombinerPattern::FMULSUBD_OP1: 4895 case MachineCombinerPattern::FMULSUBD_OP2: 4896 case MachineCombinerPattern::FNMULSUBH_OP1: 4897 case MachineCombinerPattern::FNMULSUBS_OP1: 4898 case MachineCombinerPattern::FNMULSUBD_OP1: 4899 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4900 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4901 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4902 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4903 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4904 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4905 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4906 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4907 case MachineCombinerPattern::FMLAv4f16_OP2: 4908 case MachineCombinerPattern::FMLAv4f16_OP1: 4909 case MachineCombinerPattern::FMLAv8f16_OP1: 4910 case MachineCombinerPattern::FMLAv8f16_OP2: 4911 case MachineCombinerPattern::FMLAv2f32_OP2: 4912 case MachineCombinerPattern::FMLAv2f32_OP1: 4913 case MachineCombinerPattern::FMLAv2f64_OP1: 4914 case MachineCombinerPattern::FMLAv2f64_OP2: 4915 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4916 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4917 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4918 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4919 case 
MachineCombinerPattern::FMLAv4f32_OP1: 4920 case MachineCombinerPattern::FMLAv4f32_OP2: 4921 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4922 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4923 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 4924 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4925 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 4926 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4927 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4928 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4929 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4930 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4931 case MachineCombinerPattern::FMLSv4f16_OP1: 4932 case MachineCombinerPattern::FMLSv4f16_OP2: 4933 case MachineCombinerPattern::FMLSv8f16_OP1: 4934 case MachineCombinerPattern::FMLSv8f16_OP2: 4935 case MachineCombinerPattern::FMLSv2f32_OP2: 4936 case MachineCombinerPattern::FMLSv2f64_OP2: 4937 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4938 case MachineCombinerPattern::FMLSv4f32_OP2: 4939 case MachineCombinerPattern::MULADDv8i8_OP1: 4940 case MachineCombinerPattern::MULADDv8i8_OP2: 4941 case MachineCombinerPattern::MULADDv16i8_OP1: 4942 case MachineCombinerPattern::MULADDv16i8_OP2: 4943 case MachineCombinerPattern::MULADDv4i16_OP1: 4944 case MachineCombinerPattern::MULADDv4i16_OP2: 4945 case MachineCombinerPattern::MULADDv8i16_OP1: 4946 case MachineCombinerPattern::MULADDv8i16_OP2: 4947 case MachineCombinerPattern::MULADDv2i32_OP1: 4948 case MachineCombinerPattern::MULADDv2i32_OP2: 4949 case MachineCombinerPattern::MULADDv4i32_OP1: 4950 case MachineCombinerPattern::MULADDv4i32_OP2: 4951 case MachineCombinerPattern::MULSUBv8i8_OP1: 4952 case MachineCombinerPattern::MULSUBv8i8_OP2: 4953 case MachineCombinerPattern::MULSUBv16i8_OP1: 4954 case MachineCombinerPattern::MULSUBv16i8_OP2: 4955 case MachineCombinerPattern::MULSUBv4i16_OP1: 4956 case MachineCombinerPattern::MULSUBv4i16_OP2: 4957 case 
MachineCombinerPattern::MULSUBv8i16_OP1: 4958 case MachineCombinerPattern::MULSUBv8i16_OP2: 4959 case MachineCombinerPattern::MULSUBv2i32_OP1: 4960 case MachineCombinerPattern::MULSUBv2i32_OP2: 4961 case MachineCombinerPattern::MULSUBv4i32_OP1: 4962 case MachineCombinerPattern::MULSUBv4i32_OP2: 4963 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4964 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4965 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4966 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4967 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4968 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4969 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4970 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4971 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4972 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4973 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4974 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4975 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4976 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4977 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4978 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4979 return true; 4980 } // end switch (Pattern) 4981 return false; 4982 } 4983 /// Return true when there is potentially a faster code sequence for an 4984 /// instruction chain ending in \p Root. All potential patterns are listed in 4985 /// the \p Pattern vector. Pattern should be sorted in priority order since the 4986 /// pattern evaluator stops checking as soon as it finds a faster sequence. 
4987 4988 bool AArch64InstrInfo::getMachineCombinerPatterns( 4989 MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, 4990 bool DoRegPressureReduce) const { 4991 // Integer patterns 4992 if (getMaddPatterns(Root, Patterns)) 4993 return true; 4994 // Floating point patterns 4995 if (getFMAPatterns(Root, Patterns)) 4996 return true; 4997 4998 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, 4999 DoRegPressureReduce); 5000 } 5001 5002 enum class FMAInstKind { Default, Indexed, Accumulator }; 5003 /// genFusedMultiply - Generate fused multiply instructions. 5004 /// This function supports both integer and floating point instructions. 5005 /// A typical example: 5006 /// F|MUL I=A,B,0 5007 /// F|ADD R,I,C 5008 /// ==> F|MADD R,A,B,C 5009 /// \param MF Containing MachineFunction 5010 /// \param MRI Register information 5011 /// \param TII Target information 5012 /// \param Root is the F|ADD instruction 5013 /// \param [out] InsInstrs is a vector of machine instructions and will 5014 /// contain the generated madd instruction 5015 /// \param IdxMulOpd is index of operand in Root that is the result of 5016 /// the F|MUL. In the example above IdxMulOpd is 1. 5017 /// \param MaddOpc the opcode fo the f|madd instruction 5018 /// \param RC Register class of operands 5019 /// \param kind of fma instruction (addressing mode) to be generated 5020 /// \param ReplacedAddend is the result register from the instruction 5021 /// replacing the non-combined operand, if any. 5022 static MachineInstr * 5023 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, 5024 const TargetInstrInfo *TII, MachineInstr &Root, 5025 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, 5026 unsigned MaddOpc, const TargetRegisterClass *RC, 5027 FMAInstKind kind = FMAInstKind::Default, 5028 const Register *ReplacedAddend = nullptr) { 5029 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5030 5031 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; 5032 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5033 Register ResultReg = Root.getOperand(0).getReg(); 5034 Register SrcReg0 = MUL->getOperand(1).getReg(); 5035 bool Src0IsKill = MUL->getOperand(1).isKill(); 5036 Register SrcReg1 = MUL->getOperand(2).getReg(); 5037 bool Src1IsKill = MUL->getOperand(2).isKill(); 5038 5039 unsigned SrcReg2; 5040 bool Src2IsKill; 5041 if (ReplacedAddend) { 5042 // If we just generated a new addend, we must be it's only use. 5043 SrcReg2 = *ReplacedAddend; 5044 Src2IsKill = true; 5045 } else { 5046 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 5047 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 5048 } 5049 5050 if (Register::isVirtualRegister(ResultReg)) 5051 MRI.constrainRegClass(ResultReg, RC); 5052 if (Register::isVirtualRegister(SrcReg0)) 5053 MRI.constrainRegClass(SrcReg0, RC); 5054 if (Register::isVirtualRegister(SrcReg1)) 5055 MRI.constrainRegClass(SrcReg1, RC); 5056 if (Register::isVirtualRegister(SrcReg2)) 5057 MRI.constrainRegClass(SrcReg2, RC); 5058 5059 MachineInstrBuilder MIB; 5060 if (kind == FMAInstKind::Default) 5061 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5062 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5063 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5064 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 5065 else if (kind == FMAInstKind::Indexed) 5066 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5067 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5068 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5069 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5070 .addImm(MUL->getOperand(3).getImm()); 5071 else if (kind == FMAInstKind::Accumulator) 5072 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5073 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5074 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5075 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 5076 else 5077 assert(false && "Invalid FMA instruction 
kind \n"); 5078 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 5079 InsInstrs.push_back(MIB); 5080 return MUL; 5081 } 5082 5083 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 5084 /// instructions. 5085 /// 5086 /// \see genFusedMultiply 5087 static MachineInstr *genFusedMultiplyAcc( 5088 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5089 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5090 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5091 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5092 FMAInstKind::Accumulator); 5093 } 5094 5095 /// genNeg - Helper to generate an intermediate negation of the second operand 5096 /// of Root 5097 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 5098 const TargetInstrInfo *TII, MachineInstr &Root, 5099 SmallVectorImpl<MachineInstr *> &InsInstrs, 5100 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 5101 unsigned MnegOpc, const TargetRegisterClass *RC) { 5102 Register NewVR = MRI.createVirtualRegister(RC); 5103 MachineInstrBuilder MIB = 5104 BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR) 5105 .add(Root.getOperand(2)); 5106 InsInstrs.push_back(MIB); 5107 5108 assert(InstrIdxForVirtReg.empty()); 5109 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5110 5111 return NewVR; 5112 } 5113 5114 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5115 /// instructions with an additional negation of the accumulator 5116 static MachineInstr *genFusedMultiplyAccNeg( 5117 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5118 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5119 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5120 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5121 assert(IdxMulOpd == 1); 5122 5123 Register NewVR = 5124 genNeg(MF, MRI, TII, Root, InsInstrs, 
InstrIdxForVirtReg, MnegOpc, RC); 5125 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5126 FMAInstKind::Accumulator, &NewVR); 5127 } 5128 5129 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 5130 /// instructions. 5131 /// 5132 /// \see genFusedMultiply 5133 static MachineInstr *genFusedMultiplyIdx( 5134 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5135 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5136 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5137 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5138 FMAInstKind::Indexed); 5139 } 5140 5141 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5142 /// instructions with an additional negation of the accumulator 5143 static MachineInstr *genFusedMultiplyIdxNeg( 5144 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5145 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5146 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5147 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5148 assert(IdxMulOpd == 1); 5149 5150 Register NewVR = 5151 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 5152 5153 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5154 FMAInstKind::Indexed, &NewVR); 5155 } 5156 5157 /// genMaddR - Generate madd instruction and combine mul and add using 5158 /// an extra virtual register 5159 /// Example - an ADD intermediate needs to be stored in a register: 5160 /// MUL I=A,B,0 5161 /// ADD R,I,Imm 5162 /// ==> ORR V, ZR, Imm 5163 /// ==> MADD R,A,B,V 5164 /// \param MF Containing MachineFunction 5165 /// \param MRI Register information 5166 /// \param TII Target information 5167 /// \param Root is the ADD instruction 5168 /// \param [out] InsInstrs is a vector of machine instructions and will 
5169 /// contain the generated madd instruction 5170 /// \param IdxMulOpd is index of operand in Root that is the result of 5171 /// the MUL. In the example above IdxMulOpd is 1. 5172 /// \param MaddOpc the opcode fo the madd instruction 5173 /// \param VR is a virtual register that holds the value of an ADD operand 5174 /// (V in the example above). 5175 /// \param RC Register class of operands 5176 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 5177 const TargetInstrInfo *TII, MachineInstr &Root, 5178 SmallVectorImpl<MachineInstr *> &InsInstrs, 5179 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 5180 const TargetRegisterClass *RC) { 5181 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5182 5183 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5184 Register ResultReg = Root.getOperand(0).getReg(); 5185 Register SrcReg0 = MUL->getOperand(1).getReg(); 5186 bool Src0IsKill = MUL->getOperand(1).isKill(); 5187 Register SrcReg1 = MUL->getOperand(2).getReg(); 5188 bool Src1IsKill = MUL->getOperand(2).isKill(); 5189 5190 if (Register::isVirtualRegister(ResultReg)) 5191 MRI.constrainRegClass(ResultReg, RC); 5192 if (Register::isVirtualRegister(SrcReg0)) 5193 MRI.constrainRegClass(SrcReg0, RC); 5194 if (Register::isVirtualRegister(SrcReg1)) 5195 MRI.constrainRegClass(SrcReg1, RC); 5196 if (Register::isVirtualRegister(VR)) 5197 MRI.constrainRegClass(VR, RC); 5198 5199 MachineInstrBuilder MIB = 5200 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5201 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5202 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5203 .addReg(VR); 5204 // Insert the MADD 5205 InsInstrs.push_back(MIB); 5206 return MUL; 5207 } 5208 5209 /// When getMachineCombinerPatterns() finds potential patterns, 5210 /// this function generates the instructions that could replace the 5211 /// original code sequence 5212 void AArch64InstrInfo::genAlternativeCodeSequence( 5213 MachineInstr &Root, 
MachineCombinerPattern Pattern, 5214 SmallVectorImpl<MachineInstr *> &InsInstrs, 5215 SmallVectorImpl<MachineInstr *> &DelInstrs, 5216 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 5217 MachineBasicBlock &MBB = *Root.getParent(); 5218 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5219 MachineFunction &MF = *MBB.getParent(); 5220 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5221 5222 MachineInstr *MUL = nullptr; 5223 const TargetRegisterClass *RC; 5224 unsigned Opc; 5225 switch (Pattern) { 5226 default: 5227 // Reassociate instructions. 5228 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 5229 DelInstrs, InstrIdxForVirtReg); 5230 return; 5231 case MachineCombinerPattern::MULADDW_OP1: 5232 case MachineCombinerPattern::MULADDX_OP1: 5233 // MUL I=A,B,0 5234 // ADD R,I,C 5235 // ==> MADD R,A,B,C 5236 // --- Create(MADD); 5237 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 5238 Opc = AArch64::MADDWrrr; 5239 RC = &AArch64::GPR32RegClass; 5240 } else { 5241 Opc = AArch64::MADDXrrr; 5242 RC = &AArch64::GPR64RegClass; 5243 } 5244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5245 break; 5246 case MachineCombinerPattern::MULADDW_OP2: 5247 case MachineCombinerPattern::MULADDX_OP2: 5248 // MUL I=A,B,0 5249 // ADD R,C,I 5250 // ==> MADD R,A,B,C 5251 // --- Create(MADD); 5252 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 5253 Opc = AArch64::MADDWrrr; 5254 RC = &AArch64::GPR32RegClass; 5255 } else { 5256 Opc = AArch64::MADDXrrr; 5257 RC = &AArch64::GPR64RegClass; 5258 } 5259 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5260 break; 5261 case MachineCombinerPattern::MULADDWI_OP1: 5262 case MachineCombinerPattern::MULADDXI_OP1: { 5263 // MUL I=A,B,0 5264 // ADD R,I,Imm 5265 // ==> ORR V, ZR, Imm 5266 // ==> MADD R,A,B,V 5267 // --- Create(MADD); 5268 const TargetRegisterClass *OrrRC; 5269 unsigned BitSize, OrrOpc, ZeroReg; 5270 if (Pattern == 
MachineCombinerPattern::MULADDWI_OP1) { 5271 OrrOpc = AArch64::ORRWri; 5272 OrrRC = &AArch64::GPR32spRegClass; 5273 BitSize = 32; 5274 ZeroReg = AArch64::WZR; 5275 Opc = AArch64::MADDWrrr; 5276 RC = &AArch64::GPR32RegClass; 5277 } else { 5278 OrrOpc = AArch64::ORRXri; 5279 OrrRC = &AArch64::GPR64spRegClass; 5280 BitSize = 64; 5281 ZeroReg = AArch64::XZR; 5282 Opc = AArch64::MADDXrrr; 5283 RC = &AArch64::GPR64RegClass; 5284 } 5285 Register NewVR = MRI.createVirtualRegister(OrrRC); 5286 uint64_t Imm = Root.getOperand(2).getImm(); 5287 5288 if (Root.getOperand(3).isImm()) { 5289 unsigned Val = Root.getOperand(3).getImm(); 5290 Imm = Imm << Val; 5291 } 5292 uint64_t UImm = SignExtend64(Imm, BitSize); 5293 uint64_t Encoding; 5294 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 5295 MachineInstrBuilder MIB1 = 5296 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 5297 .addReg(ZeroReg) 5298 .addImm(Encoding); 5299 InsInstrs.push_back(MIB1); 5300 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5301 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5302 } 5303 break; 5304 } 5305 case MachineCombinerPattern::MULSUBW_OP1: 5306 case MachineCombinerPattern::MULSUBX_OP1: { 5307 // MUL I=A,B,0 5308 // SUB R,I, C 5309 // ==> SUB V, 0, C 5310 // ==> MADD R,A,B,V // = -C + A*B 5311 // --- Create(MADD); 5312 const TargetRegisterClass *SubRC; 5313 unsigned SubOpc, ZeroReg; 5314 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 5315 SubOpc = AArch64::SUBWrr; 5316 SubRC = &AArch64::GPR32spRegClass; 5317 ZeroReg = AArch64::WZR; 5318 Opc = AArch64::MADDWrrr; 5319 RC = &AArch64::GPR32RegClass; 5320 } else { 5321 SubOpc = AArch64::SUBXrr; 5322 SubRC = &AArch64::GPR64spRegClass; 5323 ZeroReg = AArch64::XZR; 5324 Opc = AArch64::MADDXrrr; 5325 RC = &AArch64::GPR64RegClass; 5326 } 5327 Register NewVR = MRI.createVirtualRegister(SubRC); 5328 // SUB NewVR, 0, C 5329 MachineInstrBuilder MIB1 = 5330 BuildMI(MF, Root.getDebugLoc(), 
TII->get(SubOpc), NewVR) 5331 .addReg(ZeroReg) 5332 .add(Root.getOperand(2)); 5333 InsInstrs.push_back(MIB1); 5334 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5335 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5336 break; 5337 } 5338 case MachineCombinerPattern::MULSUBW_OP2: 5339 case MachineCombinerPattern::MULSUBX_OP2: 5340 // MUL I=A,B,0 5341 // SUB R,C,I 5342 // ==> MSUB R,A,B,C (computes C - A*B) 5343 // --- Create(MSUB); 5344 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 5345 Opc = AArch64::MSUBWrrr; 5346 RC = &AArch64::GPR32RegClass; 5347 } else { 5348 Opc = AArch64::MSUBXrrr; 5349 RC = &AArch64::GPR64RegClass; 5350 } 5351 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5352 break; 5353 case MachineCombinerPattern::MULSUBWI_OP1: 5354 case MachineCombinerPattern::MULSUBXI_OP1: { 5355 // MUL I=A,B,0 5356 // SUB R,I, Imm 5357 // ==> ORR V, ZR, -Imm 5358 // ==> MADD R,A,B,V // = -Imm + A*B 5359 // --- Create(MADD); 5360 const TargetRegisterClass *OrrRC; 5361 unsigned BitSize, OrrOpc, ZeroReg; 5362 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 5363 OrrOpc = AArch64::ORRWri; 5364 OrrRC = &AArch64::GPR32spRegClass; 5365 BitSize = 32; 5366 ZeroReg = AArch64::WZR; 5367 Opc = AArch64::MADDWrrr; 5368 RC = &AArch64::GPR32RegClass; 5369 } else { 5370 OrrOpc = AArch64::ORRXri; 5371 OrrRC = &AArch64::GPR64spRegClass; 5372 BitSize = 64; 5373 ZeroReg = AArch64::XZR; 5374 Opc = AArch64::MADDXrrr; 5375 RC = &AArch64::GPR64RegClass; 5376 } 5377 Register NewVR = MRI.createVirtualRegister(OrrRC); 5378 uint64_t Imm = Root.getOperand(2).getImm(); 5379 if (Root.getOperand(3).isImm()) { 5380 unsigned Val = Root.getOperand(3).getImm(); 5381 Imm = Imm << Val; 5382 } 5383 uint64_t UImm = SignExtend64(-Imm, BitSize); 5384 uint64_t Encoding; 5385 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 5386 MachineInstrBuilder MIB1 = 5387 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 5388 
.addReg(ZeroReg) 5389 .addImm(Encoding); 5390 InsInstrs.push_back(MIB1); 5391 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5392 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5393 } 5394 break; 5395 } 5396 5397 case MachineCombinerPattern::MULADDv8i8_OP1: 5398 Opc = AArch64::MLAv8i8; 5399 RC = &AArch64::FPR64RegClass; 5400 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5401 break; 5402 case MachineCombinerPattern::MULADDv8i8_OP2: 5403 Opc = AArch64::MLAv8i8; 5404 RC = &AArch64::FPR64RegClass; 5405 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5406 break; 5407 case MachineCombinerPattern::MULADDv16i8_OP1: 5408 Opc = AArch64::MLAv16i8; 5409 RC = &AArch64::FPR128RegClass; 5410 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5411 break; 5412 case MachineCombinerPattern::MULADDv16i8_OP2: 5413 Opc = AArch64::MLAv16i8; 5414 RC = &AArch64::FPR128RegClass; 5415 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5416 break; 5417 case MachineCombinerPattern::MULADDv4i16_OP1: 5418 Opc = AArch64::MLAv4i16; 5419 RC = &AArch64::FPR64RegClass; 5420 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5421 break; 5422 case MachineCombinerPattern::MULADDv4i16_OP2: 5423 Opc = AArch64::MLAv4i16; 5424 RC = &AArch64::FPR64RegClass; 5425 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5426 break; 5427 case MachineCombinerPattern::MULADDv8i16_OP1: 5428 Opc = AArch64::MLAv8i16; 5429 RC = &AArch64::FPR128RegClass; 5430 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5431 break; 5432 case MachineCombinerPattern::MULADDv8i16_OP2: 5433 Opc = AArch64::MLAv8i16; 5434 RC = &AArch64::FPR128RegClass; 5435 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5436 break; 5437 case MachineCombinerPattern::MULADDv2i32_OP1: 5438 Opc = AArch64::MLAv2i32; 5439 RC = &AArch64::FPR64RegClass; 5440 MUL = 
genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5441 break; 5442 case MachineCombinerPattern::MULADDv2i32_OP2: 5443 Opc = AArch64::MLAv2i32; 5444 RC = &AArch64::FPR64RegClass; 5445 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5446 break; 5447 case MachineCombinerPattern::MULADDv4i32_OP1: 5448 Opc = AArch64::MLAv4i32; 5449 RC = &AArch64::FPR128RegClass; 5450 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5451 break; 5452 case MachineCombinerPattern::MULADDv4i32_OP2: 5453 Opc = AArch64::MLAv4i32; 5454 RC = &AArch64::FPR128RegClass; 5455 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5456 break; 5457 5458 case MachineCombinerPattern::MULSUBv8i8_OP1: 5459 Opc = AArch64::MLAv8i8; 5460 RC = &AArch64::FPR64RegClass; 5461 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5462 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 5463 RC); 5464 break; 5465 case MachineCombinerPattern::MULSUBv8i8_OP2: 5466 Opc = AArch64::MLSv8i8; 5467 RC = &AArch64::FPR64RegClass; 5468 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5469 break; 5470 case MachineCombinerPattern::MULSUBv16i8_OP1: 5471 Opc = AArch64::MLAv16i8; 5472 RC = &AArch64::FPR128RegClass; 5473 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5474 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 5475 RC); 5476 break; 5477 case MachineCombinerPattern::MULSUBv16i8_OP2: 5478 Opc = AArch64::MLSv16i8; 5479 RC = &AArch64::FPR128RegClass; 5480 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5481 break; 5482 case MachineCombinerPattern::MULSUBv4i16_OP1: 5483 Opc = AArch64::MLAv4i16; 5484 RC = &AArch64::FPR64RegClass; 5485 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5486 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 5487 RC); 5488 break; 5489 case MachineCombinerPattern::MULSUBv4i16_OP2: 5490 Opc = AArch64::MLSv4i16; 5491 RC = &AArch64::FPR64RegClass; 5492 MUL = 
genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5493 break; 5494 case MachineCombinerPattern::MULSUBv8i16_OP1: 5495 Opc = AArch64::MLAv8i16; 5496 RC = &AArch64::FPR128RegClass; 5497 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5498 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 5499 RC); 5500 break; 5501 case MachineCombinerPattern::MULSUBv8i16_OP2: 5502 Opc = AArch64::MLSv8i16; 5503 RC = &AArch64::FPR128RegClass; 5504 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5505 break; 5506 case MachineCombinerPattern::MULSUBv2i32_OP1: 5507 Opc = AArch64::MLAv2i32; 5508 RC = &AArch64::FPR64RegClass; 5509 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5510 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 5511 RC); 5512 break; 5513 case MachineCombinerPattern::MULSUBv2i32_OP2: 5514 Opc = AArch64::MLSv2i32; 5515 RC = &AArch64::FPR64RegClass; 5516 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5517 break; 5518 case MachineCombinerPattern::MULSUBv4i32_OP1: 5519 Opc = AArch64::MLAv4i32; 5520 RC = &AArch64::FPR128RegClass; 5521 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5522 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 5523 RC); 5524 break; 5525 case MachineCombinerPattern::MULSUBv4i32_OP2: 5526 Opc = AArch64::MLSv4i32; 5527 RC = &AArch64::FPR128RegClass; 5528 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5529 break; 5530 5531 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5532 Opc = AArch64::MLAv4i16_indexed; 5533 RC = &AArch64::FPR64RegClass; 5534 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5535 break; 5536 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5537 Opc = AArch64::MLAv4i16_indexed; 5538 RC = &AArch64::FPR64RegClass; 5539 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5540 break; 5541 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5542 Opc = AArch64::MLAv8i16_indexed; 5543 
RC = &AArch64::FPR128RegClass; 5544 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5545 break; 5546 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5547 Opc = AArch64::MLAv8i16_indexed; 5548 RC = &AArch64::FPR128RegClass; 5549 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5550 break; 5551 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5552 Opc = AArch64::MLAv2i32_indexed; 5553 RC = &AArch64::FPR64RegClass; 5554 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5555 break; 5556 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5557 Opc = AArch64::MLAv2i32_indexed; 5558 RC = &AArch64::FPR64RegClass; 5559 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5560 break; 5561 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5562 Opc = AArch64::MLAv4i32_indexed; 5563 RC = &AArch64::FPR128RegClass; 5564 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5565 break; 5566 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5567 Opc = AArch64::MLAv4i32_indexed; 5568 RC = &AArch64::FPR128RegClass; 5569 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5570 break; 5571 5572 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5573 Opc = AArch64::MLAv4i16_indexed; 5574 RC = &AArch64::FPR64RegClass; 5575 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5576 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 5577 RC); 5578 break; 5579 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5580 Opc = AArch64::MLSv4i16_indexed; 5581 RC = &AArch64::FPR64RegClass; 5582 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5583 break; 5584 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5585 Opc = AArch64::MLAv8i16_indexed; 5586 RC = &AArch64::FPR128RegClass; 5587 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5588 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 5589 RC); 5590 break; 5591 case 
MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5592 Opc = AArch64::MLSv8i16_indexed; 5593 RC = &AArch64::FPR128RegClass; 5594 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5595 break; 5596 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5597 Opc = AArch64::MLAv2i32_indexed; 5598 RC = &AArch64::FPR64RegClass; 5599 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5600 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 5601 RC); 5602 break; 5603 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5604 Opc = AArch64::MLSv2i32_indexed; 5605 RC = &AArch64::FPR64RegClass; 5606 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5607 break; 5608 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5609 Opc = AArch64::MLAv4i32_indexed; 5610 RC = &AArch64::FPR128RegClass; 5611 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5612 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 5613 RC); 5614 break; 5615 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5616 Opc = AArch64::MLSv4i32_indexed; 5617 RC = &AArch64::FPR128RegClass; 5618 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5619 break; 5620 5621 // Floating Point Support 5622 case MachineCombinerPattern::FMULADDH_OP1: 5623 Opc = AArch64::FMADDHrrr; 5624 RC = &AArch64::FPR16RegClass; 5625 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5626 break; 5627 case MachineCombinerPattern::FMULADDS_OP1: 5628 Opc = AArch64::FMADDSrrr; 5629 RC = &AArch64::FPR32RegClass; 5630 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5631 break; 5632 case MachineCombinerPattern::FMULADDD_OP1: 5633 Opc = AArch64::FMADDDrrr; 5634 RC = &AArch64::FPR64RegClass; 5635 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5636 break; 5637 5638 case MachineCombinerPattern::FMULADDH_OP2: 5639 Opc = AArch64::FMADDHrrr; 5640 RC = &AArch64::FPR16RegClass; 5641 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 
2, Opc, RC); 5642 break; 5643 case MachineCombinerPattern::FMULADDS_OP2: 5644 Opc = AArch64::FMADDSrrr; 5645 RC = &AArch64::FPR32RegClass; 5646 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5647 break; 5648 case MachineCombinerPattern::FMULADDD_OP2: 5649 Opc = AArch64::FMADDDrrr; 5650 RC = &AArch64::FPR64RegClass; 5651 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5652 break; 5653 5654 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 5655 Opc = AArch64::FMLAv1i32_indexed; 5656 RC = &AArch64::FPR32RegClass; 5657 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5658 FMAInstKind::Indexed); 5659 break; 5660 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 5661 Opc = AArch64::FMLAv1i32_indexed; 5662 RC = &AArch64::FPR32RegClass; 5663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5664 FMAInstKind::Indexed); 5665 break; 5666 5667 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 5668 Opc = AArch64::FMLAv1i64_indexed; 5669 RC = &AArch64::FPR64RegClass; 5670 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5671 FMAInstKind::Indexed); 5672 break; 5673 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 5674 Opc = AArch64::FMLAv1i64_indexed; 5675 RC = &AArch64::FPR64RegClass; 5676 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5677 FMAInstKind::Indexed); 5678 break; 5679 5680 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 5681 RC = &AArch64::FPR64RegClass; 5682 Opc = AArch64::FMLAv4i16_indexed; 5683 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5684 FMAInstKind::Indexed); 5685 break; 5686 case MachineCombinerPattern::FMLAv4f16_OP1: 5687 RC = &AArch64::FPR64RegClass; 5688 Opc = AArch64::FMLAv4f16; 5689 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5690 FMAInstKind::Accumulator); 5691 break; 5692 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 5693 RC = &AArch64::FPR64RegClass; 5694 Opc = 
AArch64::FMLAv4i16_indexed; 5695 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5696 FMAInstKind::Indexed); 5697 break; 5698 case MachineCombinerPattern::FMLAv4f16_OP2: 5699 RC = &AArch64::FPR64RegClass; 5700 Opc = AArch64::FMLAv4f16; 5701 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5702 FMAInstKind::Accumulator); 5703 break; 5704 5705 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 5706 case MachineCombinerPattern::FMLAv2f32_OP1: 5707 RC = &AArch64::FPR64RegClass; 5708 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 5709 Opc = AArch64::FMLAv2i32_indexed; 5710 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5711 FMAInstKind::Indexed); 5712 } else { 5713 Opc = AArch64::FMLAv2f32; 5714 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5715 FMAInstKind::Accumulator); 5716 } 5717 break; 5718 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 5719 case MachineCombinerPattern::FMLAv2f32_OP2: 5720 RC = &AArch64::FPR64RegClass; 5721 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 5722 Opc = AArch64::FMLAv2i32_indexed; 5723 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5724 FMAInstKind::Indexed); 5725 } else { 5726 Opc = AArch64::FMLAv2f32; 5727 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5728 FMAInstKind::Accumulator); 5729 } 5730 break; 5731 5732 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 5733 RC = &AArch64::FPR128RegClass; 5734 Opc = AArch64::FMLAv8i16_indexed; 5735 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5736 FMAInstKind::Indexed); 5737 break; 5738 case MachineCombinerPattern::FMLAv8f16_OP1: 5739 RC = &AArch64::FPR128RegClass; 5740 Opc = AArch64::FMLAv8f16; 5741 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5742 FMAInstKind::Accumulator); 5743 break; 5744 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 5745 RC = &AArch64::FPR128RegClass; 5746 Opc = 
AArch64::FMLAv8i16_indexed; 5747 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5748 FMAInstKind::Indexed); 5749 break; 5750 case MachineCombinerPattern::FMLAv8f16_OP2: 5751 RC = &AArch64::FPR128RegClass; 5752 Opc = AArch64::FMLAv8f16; 5753 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5754 FMAInstKind::Accumulator); 5755 break; 5756 5757 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 5758 case MachineCombinerPattern::FMLAv2f64_OP1: 5759 RC = &AArch64::FPR128RegClass; 5760 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 5761 Opc = AArch64::FMLAv2i64_indexed; 5762 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5763 FMAInstKind::Indexed); 5764 } else { 5765 Opc = AArch64::FMLAv2f64; 5766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5767 FMAInstKind::Accumulator); 5768 } 5769 break; 5770 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 5771 case MachineCombinerPattern::FMLAv2f64_OP2: 5772 RC = &AArch64::FPR128RegClass; 5773 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 5774 Opc = AArch64::FMLAv2i64_indexed; 5775 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5776 FMAInstKind::Indexed); 5777 } else { 5778 Opc = AArch64::FMLAv2f64; 5779 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5780 FMAInstKind::Accumulator); 5781 } 5782 break; 5783 5784 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5785 case MachineCombinerPattern::FMLAv4f32_OP1: 5786 RC = &AArch64::FPR128RegClass; 5787 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 5788 Opc = AArch64::FMLAv4i32_indexed; 5789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5790 FMAInstKind::Indexed); 5791 } else { 5792 Opc = AArch64::FMLAv4f32; 5793 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5794 FMAInstKind::Accumulator); 5795 } 5796 break; 5797 5798 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5799 case 
MachineCombinerPattern::FMLAv4f32_OP2: 5800 RC = &AArch64::FPR128RegClass; 5801 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 5802 Opc = AArch64::FMLAv4i32_indexed; 5803 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5804 FMAInstKind::Indexed); 5805 } else { 5806 Opc = AArch64::FMLAv4f32; 5807 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5808 FMAInstKind::Accumulator); 5809 } 5810 break; 5811 5812 case MachineCombinerPattern::FMULSUBH_OP1: 5813 Opc = AArch64::FNMSUBHrrr; 5814 RC = &AArch64::FPR16RegClass; 5815 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5816 break; 5817 case MachineCombinerPattern::FMULSUBS_OP1: 5818 Opc = AArch64::FNMSUBSrrr; 5819 RC = &AArch64::FPR32RegClass; 5820 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5821 break; 5822 case MachineCombinerPattern::FMULSUBD_OP1: 5823 Opc = AArch64::FNMSUBDrrr; 5824 RC = &AArch64::FPR64RegClass; 5825 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5826 break; 5827 5828 case MachineCombinerPattern::FNMULSUBH_OP1: 5829 Opc = AArch64::FNMADDHrrr; 5830 RC = &AArch64::FPR16RegClass; 5831 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5832 break; 5833 case MachineCombinerPattern::FNMULSUBS_OP1: 5834 Opc = AArch64::FNMADDSrrr; 5835 RC = &AArch64::FPR32RegClass; 5836 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5837 break; 5838 case MachineCombinerPattern::FNMULSUBD_OP1: 5839 Opc = AArch64::FNMADDDrrr; 5840 RC = &AArch64::FPR64RegClass; 5841 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5842 break; 5843 5844 case MachineCombinerPattern::FMULSUBH_OP2: 5845 Opc = AArch64::FMSUBHrrr; 5846 RC = &AArch64::FPR16RegClass; 5847 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5848 break; 5849 case MachineCombinerPattern::FMULSUBS_OP2: 5850 Opc = AArch64::FMSUBSrrr; 5851 RC = &AArch64::FPR32RegClass; 5852 MUL = 
genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5853 break; 5854 case MachineCombinerPattern::FMULSUBD_OP2: 5855 Opc = AArch64::FMSUBDrrr; 5856 RC = &AArch64::FPR64RegClass; 5857 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5858 break; 5859 5860 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5861 Opc = AArch64::FMLSv1i32_indexed; 5862 RC = &AArch64::FPR32RegClass; 5863 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5864 FMAInstKind::Indexed); 5865 break; 5866 5867 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5868 Opc = AArch64::FMLSv1i64_indexed; 5869 RC = &AArch64::FPR64RegClass; 5870 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5871 FMAInstKind::Indexed); 5872 break; 5873 5874 case MachineCombinerPattern::FMLSv4f16_OP1: 5875 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 5876 RC = &AArch64::FPR64RegClass; 5877 Register NewVR = MRI.createVirtualRegister(RC); 5878 MachineInstrBuilder MIB1 = 5879 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) 5880 .add(Root.getOperand(2)); 5881 InsInstrs.push_back(MIB1); 5882 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5883 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 5884 Opc = AArch64::FMLAv4f16; 5885 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5886 FMAInstKind::Accumulator, &NewVR); 5887 } else { 5888 Opc = AArch64::FMLAv4i16_indexed; 5889 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5890 FMAInstKind::Indexed, &NewVR); 5891 } 5892 break; 5893 } 5894 case MachineCombinerPattern::FMLSv4f16_OP2: 5895 RC = &AArch64::FPR64RegClass; 5896 Opc = AArch64::FMLSv4f16; 5897 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5898 FMAInstKind::Accumulator); 5899 break; 5900 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5901 RC = &AArch64::FPR64RegClass; 5902 Opc = AArch64::FMLSv4i16_indexed; 5903 MUL = genFusedMultiply(MF, MRI, TII, Root, 
InsInstrs, 2, Opc, RC, 5904 FMAInstKind::Indexed); 5905 break; 5906 5907 case MachineCombinerPattern::FMLSv2f32_OP2: 5908 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5909 RC = &AArch64::FPR64RegClass; 5910 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 5911 Opc = AArch64::FMLSv2i32_indexed; 5912 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5913 FMAInstKind::Indexed); 5914 } else { 5915 Opc = AArch64::FMLSv2f32; 5916 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5917 FMAInstKind::Accumulator); 5918 } 5919 break; 5920 5921 case MachineCombinerPattern::FMLSv8f16_OP1: 5922 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 5923 RC = &AArch64::FPR128RegClass; 5924 Register NewVR = MRI.createVirtualRegister(RC); 5925 MachineInstrBuilder MIB1 = 5926 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) 5927 .add(Root.getOperand(2)); 5928 InsInstrs.push_back(MIB1); 5929 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5930 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 5931 Opc = AArch64::FMLAv8f16; 5932 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5933 FMAInstKind::Accumulator, &NewVR); 5934 } else { 5935 Opc = AArch64::FMLAv8i16_indexed; 5936 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5937 FMAInstKind::Indexed, &NewVR); 5938 } 5939 break; 5940 } 5941 case MachineCombinerPattern::FMLSv8f16_OP2: 5942 RC = &AArch64::FPR128RegClass; 5943 Opc = AArch64::FMLSv8f16; 5944 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5945 FMAInstKind::Accumulator); 5946 break; 5947 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5948 RC = &AArch64::FPR128RegClass; 5949 Opc = AArch64::FMLSv8i16_indexed; 5950 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5951 FMAInstKind::Indexed); 5952 break; 5953 5954 case MachineCombinerPattern::FMLSv2f64_OP2: 5955 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5956 RC = 
&AArch64::FPR128RegClass; 5957 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 5958 Opc = AArch64::FMLSv2i64_indexed; 5959 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5960 FMAInstKind::Indexed); 5961 } else { 5962 Opc = AArch64::FMLSv2f64; 5963 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5964 FMAInstKind::Accumulator); 5965 } 5966 break; 5967 5968 case MachineCombinerPattern::FMLSv4f32_OP2: 5969 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5970 RC = &AArch64::FPR128RegClass; 5971 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 5972 Opc = AArch64::FMLSv4i32_indexed; 5973 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5974 FMAInstKind::Indexed); 5975 } else { 5976 Opc = AArch64::FMLSv4f32; 5977 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5978 FMAInstKind::Accumulator); 5979 } 5980 break; 5981 case MachineCombinerPattern::FMLSv2f32_OP1: 5982 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 5983 RC = &AArch64::FPR64RegClass; 5984 Register NewVR = MRI.createVirtualRegister(RC); 5985 MachineInstrBuilder MIB1 = 5986 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) 5987 .add(Root.getOperand(2)); 5988 InsInstrs.push_back(MIB1); 5989 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5990 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 5991 Opc = AArch64::FMLAv2i32_indexed; 5992 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5993 FMAInstKind::Indexed, &NewVR); 5994 } else { 5995 Opc = AArch64::FMLAv2f32; 5996 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5997 FMAInstKind::Accumulator, &NewVR); 5998 } 5999 break; 6000 } 6001 case MachineCombinerPattern::FMLSv4f32_OP1: 6002 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 6003 RC = &AArch64::FPR128RegClass; 6004 Register NewVR = MRI.createVirtualRegister(RC); 6005 MachineInstrBuilder MIB1 = 6006 BuildMI(MF, 
Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) 6007 .add(Root.getOperand(2)); 6008 InsInstrs.push_back(MIB1); 6009 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6010 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 6011 Opc = AArch64::FMLAv4i32_indexed; 6012 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6013 FMAInstKind::Indexed, &NewVR); 6014 } else { 6015 Opc = AArch64::FMLAv4f32; 6016 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6017 FMAInstKind::Accumulator, &NewVR); 6018 } 6019 break; 6020 } 6021 case MachineCombinerPattern::FMLSv2f64_OP1: 6022 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 6023 RC = &AArch64::FPR128RegClass; 6024 Register NewVR = MRI.createVirtualRegister(RC); 6025 MachineInstrBuilder MIB1 = 6026 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) 6027 .add(Root.getOperand(2)); 6028 InsInstrs.push_back(MIB1); 6029 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6030 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 6031 Opc = AArch64::FMLAv2i64_indexed; 6032 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6033 FMAInstKind::Indexed, &NewVR); 6034 } else { 6035 Opc = AArch64::FMLAv2f64; 6036 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6037 FMAInstKind::Accumulator, &NewVR); 6038 } 6039 break; 6040 } 6041 } // end switch (Pattern) 6042 // Record MUL and ADD/SUB for deletion 6043 // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and 6044 // CodeGen/AArch64/urem-seteq-nonzero.ll. 6045 // assert(MUL && "MUL was never set"); 6046 DelInstrs.push_back(MUL); 6047 DelInstrs.push_back(&Root); 6048 } 6049 6050 /// Replace csincr-branch sequence by simple conditional branch 6051 /// 6052 /// Examples: 6053 /// 1. 
\code 6054 /// csinc w9, wzr, wzr, <condition code> 6055 /// tbnz w9, #0, 0x44 6056 /// \endcode 6057 /// to 6058 /// \code 6059 /// b.<inverted condition code> 6060 /// \endcode 6061 /// 6062 /// 2. \code 6063 /// csinc w9, wzr, wzr, <condition code> 6064 /// tbz w9, #0, 0x44 6065 /// \endcode 6066 /// to 6067 /// \code 6068 /// b.<condition code> 6069 /// \endcode 6070 /// 6071 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 6072 /// compare's constant operand is power of 2. 6073 /// 6074 /// Examples: 6075 /// \code 6076 /// and w8, w8, #0x400 6077 /// cbnz w8, L1 6078 /// \endcode 6079 /// to 6080 /// \code 6081 /// tbnz w8, #10, L1 6082 /// \endcode 6083 /// 6084 /// \param MI Conditional Branch 6085 /// \return True when the simple conditional branch is generated 6086 /// 6087 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 6088 bool IsNegativeBranch = false; 6089 bool IsTestAndBranch = false; 6090 unsigned TargetBBInMI = 0; 6091 switch (MI.getOpcode()) { 6092 default: 6093 llvm_unreachable("Unknown branch instruction?"); 6094 case AArch64::Bcc: 6095 return false; 6096 case AArch64::CBZW: 6097 case AArch64::CBZX: 6098 TargetBBInMI = 1; 6099 break; 6100 case AArch64::CBNZW: 6101 case AArch64::CBNZX: 6102 TargetBBInMI = 1; 6103 IsNegativeBranch = true; 6104 break; 6105 case AArch64::TBZW: 6106 case AArch64::TBZX: 6107 TargetBBInMI = 2; 6108 IsTestAndBranch = true; 6109 break; 6110 case AArch64::TBNZW: 6111 case AArch64::TBNZX: 6112 TargetBBInMI = 2; 6113 IsNegativeBranch = true; 6114 IsTestAndBranch = true; 6115 break; 6116 } 6117 // So we increment a zero register and test for bits other 6118 // than bit 0? Conservatively bail out in case the verifier 6119 // missed this case. 6120 if (IsTestAndBranch && MI.getOperand(1).getImm()) 6121 return false; 6122 6123 // Find Definition. 
6124 assert(MI.getParent() && "Incomplete machine instruciton\n"); 6125 MachineBasicBlock *MBB = MI.getParent(); 6126 MachineFunction *MF = MBB->getParent(); 6127 MachineRegisterInfo *MRI = &MF->getRegInfo(); 6128 Register VReg = MI.getOperand(0).getReg(); 6129 if (!Register::isVirtualRegister(VReg)) 6130 return false; 6131 6132 MachineInstr *DefMI = MRI->getVRegDef(VReg); 6133 6134 // Look through COPY instructions to find definition. 6135 while (DefMI->isCopy()) { 6136 Register CopyVReg = DefMI->getOperand(1).getReg(); 6137 if (!MRI->hasOneNonDBGUse(CopyVReg)) 6138 return false; 6139 if (!MRI->hasOneDef(CopyVReg)) 6140 return false; 6141 DefMI = MRI->getVRegDef(CopyVReg); 6142 } 6143 6144 switch (DefMI->getOpcode()) { 6145 default: 6146 return false; 6147 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 6148 case AArch64::ANDWri: 6149 case AArch64::ANDXri: { 6150 if (IsTestAndBranch) 6151 return false; 6152 if (DefMI->getParent() != MBB) 6153 return false; 6154 if (!MRI->hasOneNonDBGUse(VReg)) 6155 return false; 6156 6157 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 6158 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 6159 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); 6160 if (!isPowerOf2_64(Mask)) 6161 return false; 6162 6163 MachineOperand &MO = DefMI->getOperand(1); 6164 Register NewReg = MO.getReg(); 6165 if (!Register::isVirtualRegister(NewReg)) 6166 return false; 6167 6168 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 6169 6170 MachineBasicBlock &RefToMBB = *MBB; 6171 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 6172 DebugLoc DL = MI.getDebugLoc(); 6173 unsigned Imm = Log2_64(Mask); 6174 unsigned Opc = (Imm < 32) 6175 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 6176 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 6177 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 6178 .addReg(NewReg) 6179 .addImm(Imm) 6180 .addMBB(TBB); 6181 // Register lives on to the CBZ now. 
6182 MO.setIsKill(false); 6183 6184 // For immediate smaller than 32, we need to use the 32-bit 6185 // variant (W) in all cases. Indeed the 64-bit variant does not 6186 // allow to encode them. 6187 // Therefore, if the input register is 64-bit, we need to take the 6188 // 32-bit sub-part. 6189 if (!Is32Bit && Imm < 32) 6190 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 6191 MI.eraseFromParent(); 6192 return true; 6193 } 6194 // Look for CSINC 6195 case AArch64::CSINCWr: 6196 case AArch64::CSINCXr: { 6197 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 6198 DefMI->getOperand(2).getReg() == AArch64::WZR) && 6199 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 6200 DefMI->getOperand(2).getReg() == AArch64::XZR)) 6201 return false; 6202 6203 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 6204 return false; 6205 6206 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 6207 // Convert only when the condition code is not modified between 6208 // the CSINC and the branch. The CC may be used by other 6209 // instructions in between. 
6210 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 6211 return false; 6212 MachineBasicBlock &RefToMBB = *MBB; 6213 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 6214 DebugLoc DL = MI.getDebugLoc(); 6215 if (IsNegativeBranch) 6216 CC = AArch64CC::getInvertedCondCode(CC); 6217 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 6218 MI.eraseFromParent(); 6219 return true; 6220 } 6221 } 6222 } 6223 6224 std::pair<unsigned, unsigned> 6225 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 6226 const unsigned Mask = AArch64II::MO_FRAGMENT; 6227 return std::make_pair(TF & Mask, TF & ~Mask); 6228 } 6229 6230 ArrayRef<std::pair<unsigned, const char *>> 6231 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 6232 using namespace AArch64II; 6233 6234 static const std::pair<unsigned, const char *> TargetFlags[] = { 6235 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 6236 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 6237 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 6238 {MO_HI12, "aarch64-hi12"}}; 6239 return makeArrayRef(TargetFlags); 6240 } 6241 6242 ArrayRef<std::pair<unsigned, const char *>> 6243 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 6244 using namespace AArch64II; 6245 6246 static const std::pair<unsigned, const char *> TargetFlags[] = { 6247 {MO_COFFSTUB, "aarch64-coffstub"}, 6248 {MO_GOT, "aarch64-got"}, 6249 {MO_NC, "aarch64-nc"}, 6250 {MO_S, "aarch64-s"}, 6251 {MO_TLS, "aarch64-tls"}, 6252 {MO_DLLIMPORT, "aarch64-dllimport"}, 6253 {MO_PREL, "aarch64-prel"}, 6254 {MO_TAGGED, "aarch64-tagged"}}; 6255 return makeArrayRef(TargetFlags); 6256 } 6257 6258 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 6259 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 6260 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 6261 {{MOSuppressPair, 
"aarch64-suppress-pair"}, 6262 {MOStridedAccess, "aarch64-strided-access"}}; 6263 return makeArrayRef(TargetFlags); 6264 } 6265 6266 /// Constants defining how certain sequences should be outlined. 6267 /// This encompasses how an outlined function should be called, and what kind of 6268 /// frame should be emitted for that outlined function. 6269 /// 6270 /// \p MachineOutlinerDefault implies that the function should be called with 6271 /// a save and restore of LR to the stack. 6272 /// 6273 /// That is, 6274 /// 6275 /// I1 Save LR OUTLINED_FUNCTION: 6276 /// I2 --> BL OUTLINED_FUNCTION I1 6277 /// I3 Restore LR I2 6278 /// I3 6279 /// RET 6280 /// 6281 /// * Call construction overhead: 3 (save + BL + restore) 6282 /// * Frame construction overhead: 1 (ret) 6283 /// * Requires stack fixups? Yes 6284 /// 6285 /// \p MachineOutlinerTailCall implies that the function is being created from 6286 /// a sequence of instructions ending in a return. 6287 /// 6288 /// That is, 6289 /// 6290 /// I1 OUTLINED_FUNCTION: 6291 /// I2 --> B OUTLINED_FUNCTION I1 6292 /// RET I2 6293 /// RET 6294 /// 6295 /// * Call construction overhead: 1 (B) 6296 /// * Frame construction overhead: 0 (Return included in sequence) 6297 /// * Requires stack fixups? No 6298 /// 6299 /// \p MachineOutlinerNoLRSave implies that the function should be called using 6300 /// a BL instruction, but doesn't require LR to be saved and restored. This 6301 /// happens when LR is known to be dead. 6302 /// 6303 /// That is, 6304 /// 6305 /// I1 OUTLINED_FUNCTION: 6306 /// I2 --> BL OUTLINED_FUNCTION I1 6307 /// I3 I2 6308 /// I3 6309 /// RET 6310 /// 6311 /// * Call construction overhead: 1 (BL) 6312 /// * Frame construction overhead: 1 (RET) 6313 /// * Requires stack fixups? No 6314 /// 6315 /// \p MachineOutlinerThunk implies that the function is being created from 6316 /// a sequence of instructions ending in a call. 
The outlined function is 6317 /// called with a BL instruction, and the outlined function tail-calls the 6318 /// original call destination. 6319 /// 6320 /// That is, 6321 /// 6322 /// I1 OUTLINED_FUNCTION: 6323 /// I2 --> BL OUTLINED_FUNCTION I1 6324 /// BL f I2 6325 /// B f 6326 /// * Call construction overhead: 1 (BL) 6327 /// * Frame construction overhead: 0 6328 /// * Requires stack fixups? No 6329 /// 6330 /// \p MachineOutlinerRegSave implies that the function should be called with a 6331 /// save and restore of LR to an available register. This allows us to avoid 6332 /// stack fixups. Note that this outlining variant is compatible with the 6333 /// NoLRSave case. 6334 /// 6335 /// That is, 6336 /// 6337 /// I1 Save LR OUTLINED_FUNCTION: 6338 /// I2 --> BL OUTLINED_FUNCTION I1 6339 /// I3 Restore LR I2 6340 /// I3 6341 /// RET 6342 /// 6343 /// * Call construction overhead: 3 (save + BL + restore) 6344 /// * Frame construction overhead: 1 (ret) 6345 /// * Requires stack fixups? No 6346 enum MachineOutlinerClass { 6347 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 6348 MachineOutlinerTailCall, /// Only emit a branch. 6349 MachineOutlinerNoLRSave, /// Emit a call and return. 6350 MachineOutlinerThunk, /// Emit a call and tail-call. 6351 MachineOutlinerRegSave /// Same as default, but save to a register. 6352 }; 6353 6354 enum MachineOutlinerMBBFlags { 6355 LRUnavailableSomewhere = 0x2, 6356 HasCalls = 0x4, 6357 UnsafeRegsDead = 0x8 6358 }; 6359 6360 unsigned 6361 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { 6362 assert(C.LRUWasSet && "LRU wasn't set?"); 6363 MachineFunction *MF = C.getMF(); 6364 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6365 MF->getSubtarget().getRegisterInfo()); 6366 6367 // Check if there is an available register across the sequence that we can 6368 // use. 
6369 for (unsigned Reg : AArch64::GPR64RegClass) { 6370 if (!ARI->isReservedReg(*MF, Reg) && 6371 Reg != AArch64::LR && // LR is not reserved, but don't use it. 6372 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 6373 Reg != AArch64::X17 && // Ditto for X17. 6374 C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) 6375 return Reg; 6376 } 6377 6378 // No suitable register. Return 0. 6379 return 0u; 6380 } 6381 6382 static bool 6383 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 6384 const outliner::Candidate &b) { 6385 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 6386 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 6387 6388 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && 6389 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); 6390 } 6391 6392 static bool 6393 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 6394 const outliner::Candidate &b) { 6395 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 6396 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 6397 6398 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); 6399 } 6400 6401 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 6402 const outliner::Candidate &b) { 6403 const AArch64Subtarget &SubtargetA = 6404 a.getMF()->getSubtarget<AArch64Subtarget>(); 6405 const AArch64Subtarget &SubtargetB = 6406 b.getMF()->getSubtarget<AArch64Subtarget>(); 6407 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 6408 } 6409 6410 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( 6411 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 6412 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 6413 unsigned SequenceSize = 6414 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 6415 [this](unsigned Sum, const MachineInstr &MI) { 6416 return Sum 
+ getInstSizeInBytes(MI); 6417 }); 6418 unsigned NumBytesToCreateFrame = 0; 6419 6420 // We only allow outlining for functions having exactly matching return 6421 // address signing attributes, i.e., all share the same value for the 6422 // attribute "sign-return-address" and all share the same type of key they 6423 // are signed with. 6424 // Additionally we require all functions to simultaniously either support 6425 // v8.3a features or not. Otherwise an outlined function could get signed 6426 // using dedicated v8.3 instructions and a call from a function that doesn't 6427 // support v8.3 instructions would therefore be invalid. 6428 if (std::adjacent_find( 6429 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 6430 [](const outliner::Candidate &a, const outliner::Candidate &b) { 6431 // Return true if a and b are non-equal w.r.t. return address 6432 // signing or support of v8.3a features 6433 if (outliningCandidatesSigningScopeConsensus(a, b) && 6434 outliningCandidatesSigningKeyConsensus(a, b) && 6435 outliningCandidatesV8_3OpsConsensus(a, b)) { 6436 return false; 6437 } 6438 return true; 6439 }) != RepeatedSequenceLocs.end()) { 6440 return outliner::OutlinedFunction(); 6441 } 6442 6443 // Since at this point all candidates agree on their return address signing 6444 // picking just one is fine. If the candidate functions potentially sign their 6445 // return addresses, the outlined function should do the same. Note that in 6446 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 6447 // not certainly true that the outlined function will have to sign its return 6448 // address but this decision is made later, when the decision to outline 6449 // has already been made. 6450 // The same holds for the number of additional instructions we need: On 6451 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 6452 // necessary. 
However, at this point we don't know if the outlined function 6453 // will have a RET instruction so we assume the worst. 6454 const TargetRegisterInfo &TRI = getRegisterInfo(); 6455 if (FirstCand.getMF() 6456 ->getInfo<AArch64FunctionInfo>() 6457 ->shouldSignReturnAddress(true)) { 6458 // One PAC and one AUT instructions 6459 NumBytesToCreateFrame += 8; 6460 6461 // We have to check if sp modifying instructions would get outlined. 6462 // If so we only allow outlining if sp is unchanged overall, so matching 6463 // sub and add instructions are okay to outline, all other sp modifications 6464 // are not 6465 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 6466 int SPValue = 0; 6467 MachineBasicBlock::iterator MBBI = C.front(); 6468 for (;;) { 6469 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 6470 switch (MBBI->getOpcode()) { 6471 case AArch64::ADDXri: 6472 case AArch64::ADDWri: 6473 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 6474 assert(MBBI->getOperand(2).isImm() && 6475 "Expected operand to be immediate"); 6476 assert(MBBI->getOperand(1).isReg() && 6477 "Expected operand to be a register"); 6478 // Check if the add just increments sp. If so, we search for 6479 // matching sub instructions that decrement sp. If not, the 6480 // modification is illegal 6481 if (MBBI->getOperand(1).getReg() == AArch64::SP) 6482 SPValue += MBBI->getOperand(2).getImm(); 6483 else 6484 return true; 6485 break; 6486 case AArch64::SUBXri: 6487 case AArch64::SUBWri: 6488 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 6489 assert(MBBI->getOperand(2).isImm() && 6490 "Expected operand to be immediate"); 6491 assert(MBBI->getOperand(1).isReg() && 6492 "Expected operand to be a register"); 6493 // Check if the sub just decrements sp. If so, we search for 6494 // matching add instructions that increment sp. 
If not, the 6495 // modification is illegal 6496 if (MBBI->getOperand(1).getReg() == AArch64::SP) 6497 SPValue -= MBBI->getOperand(2).getImm(); 6498 else 6499 return true; 6500 break; 6501 default: 6502 return true; 6503 } 6504 } 6505 if (MBBI == C.back()) 6506 break; 6507 ++MBBI; 6508 } 6509 if (SPValue) 6510 return true; 6511 return false; 6512 }; 6513 // Remove candidates with illegal stack modifying instructions 6514 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 6515 6516 // If the sequence doesn't have enough candidates left, then we're done. 6517 if (RepeatedSequenceLocs.size() < 2) 6518 return outliner::OutlinedFunction(); 6519 } 6520 6521 // Properties about candidate MBBs that hold for all of them. 6522 unsigned FlagsSetInAll = 0xF; 6523 6524 // Compute liveness information for each candidate, and set FlagsSetInAll. 6525 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 6526 [&FlagsSetInAll](outliner::Candidate &C) { 6527 FlagsSetInAll &= C.Flags; 6528 }); 6529 6530 // According to the AArch64 Procedure Call Standard, the following are 6531 // undefined on entry/exit from a function call: 6532 // 6533 // * Registers x16, x17, (and thus w16, w17) 6534 // * Condition codes (and thus the NZCV register) 6535 // 6536 // Because if this, we can't outline any sequence of instructions where 6537 // one 6538 // of these registers is live into/across it. Thus, we need to delete 6539 // those 6540 // candidates. 6541 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { 6542 // If the unsafe registers in this block are all dead, then we don't need 6543 // to compute liveness here. 6544 if (C.Flags & UnsafeRegsDead) 6545 return false; 6546 C.initLRU(TRI); 6547 LiveRegUnits LRU = C.LRU; 6548 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || 6549 !LRU.available(AArch64::NZCV)); 6550 }; 6551 6552 // Are there any candidates where those registers are live? 
6553 if (!(FlagsSetInAll & UnsafeRegsDead)) { 6554 // Erase every candidate that violates the restrictions above. (It could be 6555 // true that we have viable candidates, so it's not worth bailing out in 6556 // the case that, say, 1 out of 20 candidates violate the restructions.) 6557 llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); 6558 6559 // If the sequence doesn't have enough candidates left, then we're done. 6560 if (RepeatedSequenceLocs.size() < 2) 6561 return outliner::OutlinedFunction(); 6562 } 6563 6564 // At this point, we have only "safe" candidates to outline. Figure out 6565 // frame + call instruction information. 6566 6567 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); 6568 6569 // Helper lambda which sets call information for every candidate. 6570 auto SetCandidateCallInfo = 6571 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 6572 for (outliner::Candidate &C : RepeatedSequenceLocs) 6573 C.setCallInfo(CallID, NumBytesForCall); 6574 }; 6575 6576 unsigned FrameID = MachineOutlinerDefault; 6577 NumBytesToCreateFrame += 4; 6578 6579 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 6580 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); 6581 }); 6582 6583 // We check to see if CFI Instructions are present, and if they are 6584 // we find the number of CFI Instructions in the candidates. 
6585 unsigned CFICount = 0; 6586 MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); 6587 for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); 6588 Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { 6589 const std::vector<MCCFIInstruction> &CFIInstructions = 6590 RepeatedSequenceLocs[0].getMF()->getFrameInstructions(); 6591 if (MBBI->isCFIInstruction()) { 6592 unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex(); 6593 MCCFIInstruction CFI = CFIInstructions[CFIIndex]; 6594 CFICount++; 6595 } 6596 MBBI++; 6597 } 6598 6599 // We compare the number of found CFI Instructions to the number of CFI 6600 // instructions in the parent function for each candidate. We must check this 6601 // since if we outline one of the CFI instructions in a function, we have to 6602 // outline them all for correctness. If we do not, the address offsets will be 6603 // incorrect between the two sections of the program. 6604 for (outliner::Candidate &C : RepeatedSequenceLocs) { 6605 std::vector<MCCFIInstruction> CFIInstructions = 6606 C.getMF()->getFrameInstructions(); 6607 6608 if (CFICount > 0 && CFICount != CFIInstructions.size()) 6609 return outliner::OutlinedFunction(); 6610 } 6611 6612 // Returns true if an instructions is safe to fix up, false otherwise. 6613 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 6614 if (MI.isCall()) 6615 return true; 6616 6617 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 6618 !MI.readsRegister(AArch64::SP, &TRI)) 6619 return true; 6620 6621 // Any modification of SP will break our code to save/restore LR. 6622 // FIXME: We could handle some instructions which add a constant 6623 // offset to SP, with a bit more work. 6624 if (MI.modifiesRegister(AArch64::SP, &TRI)) 6625 return false; 6626 6627 // At this point, we have a stack instruction that we might need to 6628 // fix up. We'll handle it if it's a load or store. 6629 if (MI.mayLoadOrStore()) { 6630 const MachineOperand *Base; // Filled with the base operand of MI. 
6631 int64_t Offset; // Filled with the offset of MI. 6632 bool OffsetIsScalable; 6633 6634 // Does it allow us to offset the base operand and is the base the 6635 // register SP? 6636 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 6637 !Base->isReg() || Base->getReg() != AArch64::SP) 6638 return false; 6639 6640 // Fixe-up code below assumes bytes. 6641 if (OffsetIsScalable) 6642 return false; 6643 6644 // Find the minimum/maximum offset for this instruction and check 6645 // if fixing it up would be in range. 6646 int64_t MinOffset, 6647 MaxOffset; // Unscaled offsets for the instruction. 6648 TypeSize Scale(0U, false); // The scale to multiply the offsets by. 6649 unsigned DummyWidth; 6650 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 6651 6652 Offset += 16; // Update the offset to what it would be if we outlined. 6653 if (Offset < MinOffset * (int64_t)Scale.getFixedSize() || 6654 Offset > MaxOffset * (int64_t)Scale.getFixedSize()) 6655 return false; 6656 6657 // It's in range, so we can outline it. 6658 return true; 6659 } 6660 6661 // FIXME: Add handling for instructions like "add x0, sp, #8". 6662 6663 // We can't fix it up, so don't outline it. 6664 return false; 6665 }; 6666 6667 // True if it's possible to fix up each stack instruction in this sequence. 6668 // Important for frames/call variants that modify the stack. 6669 bool AllStackInstrsSafe = std::all_of( 6670 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 6671 6672 // If the last instruction in any candidate is a terminator, then we should 6673 // tail call all of the candidates. 
6674 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 6675 FrameID = MachineOutlinerTailCall; 6676 NumBytesToCreateFrame = 0; 6677 SetCandidateCallInfo(MachineOutlinerTailCall, 4); 6678 } 6679 6680 else if (LastInstrOpcode == AArch64::BL || 6681 ((LastInstrOpcode == AArch64::BLR || 6682 LastInstrOpcode == AArch64::BLRNoIP) && 6683 !HasBTI)) { 6684 // FIXME: Do we need to check if the code after this uses the value of LR? 6685 FrameID = MachineOutlinerThunk; 6686 NumBytesToCreateFrame = 0; 6687 SetCandidateCallInfo(MachineOutlinerThunk, 4); 6688 } 6689 6690 else { 6691 // We need to decide how to emit calls + frames. We can always emit the same 6692 // frame if we don't need to save to the stack. If we have to save to the 6693 // stack, then we need a different frame. 6694 unsigned NumBytesNoStackCalls = 0; 6695 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 6696 6697 // Check if we have to save LR. 6698 for (outliner::Candidate &C : RepeatedSequenceLocs) { 6699 C.initLRU(TRI); 6700 6701 // If we have a noreturn caller, then we're going to be conservative and 6702 // say that we have to save LR. If we don't have a ret at the end of the 6703 // block, then we can't reason about liveness accurately. 6704 // 6705 // FIXME: We can probably do better than always disabling this in 6706 // noreturn functions by fixing up the liveness info. 6707 bool IsNoReturn = 6708 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 6709 6710 // Is LR available? If so, we don't need a save. 6711 if (C.LRU.available(AArch64::LR) && !IsNoReturn) { 6712 NumBytesNoStackCalls += 4; 6713 C.setCallInfo(MachineOutlinerNoLRSave, 4); 6714 CandidatesWithoutStackFixups.push_back(C); 6715 } 6716 6717 // Is an unused register available? If so, we won't modify the stack, so 6718 // we can outline with the same frame type as those that don't save LR. 
6719 else if (findRegisterToSaveLRTo(C)) { 6720 NumBytesNoStackCalls += 12; 6721 C.setCallInfo(MachineOutlinerRegSave, 12); 6722 CandidatesWithoutStackFixups.push_back(C); 6723 } 6724 6725 // Is SP used in the sequence at all? If not, we don't have to modify 6726 // the stack, so we are guaranteed to get the same frame. 6727 else if (C.UsedInSequence.available(AArch64::SP)) { 6728 NumBytesNoStackCalls += 12; 6729 C.setCallInfo(MachineOutlinerDefault, 12); 6730 CandidatesWithoutStackFixups.push_back(C); 6731 } 6732 6733 // If we outline this, we need to modify the stack. Pretend we don't 6734 // outline this by saving all of its bytes. 6735 else { 6736 NumBytesNoStackCalls += SequenceSize; 6737 } 6738 } 6739 6740 // If there are no places where we have to save LR, then note that we 6741 // don't have to update the stack. Otherwise, give every candidate the 6742 // default call type, as long as it's safe to do so. 6743 if (!AllStackInstrsSafe || 6744 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 6745 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 6746 FrameID = MachineOutlinerNoLRSave; 6747 } else { 6748 SetCandidateCallInfo(MachineOutlinerDefault, 12); 6749 6750 // Bugzilla ID: 46767 6751 // TODO: Check if fixing up the stack more than once is safe so we can 6752 // outline these. 6753 // 6754 // An outline resulting in a caller that requires stack fixups at the 6755 // callsite to a callee that also requires stack fixups can happen when 6756 // there are no available registers at the candidate callsite for a 6757 // candidate that itself also has calls. 6758 // 6759 // In other words if function_containing_sequence in the following pseudo 6760 // assembly requires that we save LR at the point of the call, but there 6761 // are no available registers: in this case we save using SP and as a 6762 // result the SP offsets requires stack fixups by multiples of 16. 6763 // 6764 // function_containing_sequence: 6765 // ... 
6766 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 6767 // call OUTLINED_FUNCTION_N 6768 // restore LR from SP 6769 // ... 6770 // 6771 // OUTLINED_FUNCTION_N: 6772 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 6773 // ... 6774 // bl foo 6775 // restore LR from SP 6776 // ret 6777 // 6778 // Because the code to handle more than one stack fixup does not 6779 // currently have the proper checks for legality, these cases will assert 6780 // in the AArch64 MachineOutliner. This is because the code to do this 6781 // needs more hardening, testing, better checks that generated code is 6782 // legal, etc and because it is only verified to handle a single pass of 6783 // stack fixup. 6784 // 6785 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 6786 // these cases until they are known to be handled. Bugzilla 46767 is 6787 // referenced in comments at the assert site. 6788 // 6789 // To avoid asserting (or generating non-legal code on noassert builds) 6790 // we remove all candidates which would need more than one stack fixup by 6791 // pruning the cases where the candidate has calls while also having no 6792 // available LR and having no available general purpose registers to copy 6793 // LR to (ie one extra stack save/restore). 6794 // 6795 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6796 erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { 6797 return (std::any_of( 6798 C.front(), std::next(C.back()), 6799 [](const MachineInstr &MI) { return MI.isCall(); })) && 6800 (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); 6801 }); 6802 } 6803 } 6804 6805 // If we dropped all of the candidates, bail out here. 6806 if (RepeatedSequenceLocs.size() < 2) { 6807 RepeatedSequenceLocs.clear(); 6808 return outliner::OutlinedFunction(); 6809 } 6810 } 6811 6812 // Does every candidate's MBB contain a call? If so, then we might have a call 6813 // in the range. 
6814 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6815 // Check if the range contains a call. These require a save + restore of the 6816 // link register. 6817 bool ModStackToSaveLR = false; 6818 if (std::any_of(FirstCand.front(), FirstCand.back(), 6819 [](const MachineInstr &MI) { return MI.isCall(); })) 6820 ModStackToSaveLR = true; 6821 6822 // Handle the last instruction separately. If this is a tail call, then the 6823 // last instruction is a call. We don't want to save + restore in this case. 6824 // However, it could be possible that the last instruction is a call without 6825 // it being valid to tail call this sequence. We should consider this as 6826 // well. 6827 else if (FrameID != MachineOutlinerThunk && 6828 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 6829 ModStackToSaveLR = true; 6830 6831 if (ModStackToSaveLR) { 6832 // We can't fix up the stack. Bail out. 6833 if (!AllStackInstrsSafe) { 6834 RepeatedSequenceLocs.clear(); 6835 return outliner::OutlinedFunction(); 6836 } 6837 6838 // Save + restore LR. 6839 NumBytesToCreateFrame += 8; 6840 } 6841 } 6842 6843 // If we have CFI instructions, we can only outline if the outlined section 6844 // can be a tail call 6845 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 6846 return outliner::OutlinedFunction(); 6847 6848 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 6849 NumBytesToCreateFrame, FrameID); 6850 } 6851 6852 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 6853 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 6854 const Function &F = MF.getFunction(); 6855 6856 // Can F be deduplicated by the linker? If it can, don't outline from it. 6857 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 6858 return false; 6859 6860 // Don't outline from functions with section markings; the program could 6861 // expect that all the code is in the named section. 
6862 // FIXME: Allow outlining from multiple functions with the same section 6863 // marking. 6864 if (F.hasSection()) 6865 return false; 6866 6867 // Outlining from functions with redzones is unsafe since the outliner may 6868 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 6869 // outline from it. 6870 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 6871 if (!AFI || AFI->hasRedZone().getValueOr(true)) 6872 return false; 6873 6874 // FIXME: Teach the outliner to generate/handle Windows unwind info. 6875 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 6876 return false; 6877 6878 // It's safe to outline from MF. 6879 return true; 6880 } 6881 6882 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, 6883 unsigned &Flags) const { 6884 // Check if LR is available through all of the MBB. If it's not, then set 6885 // a flag. 6886 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 6887 "Suitable Machine Function for outlining must track liveness"); 6888 LiveRegUnits LRU(getRegisterInfo()); 6889 6890 std::for_each(MBB.rbegin(), MBB.rend(), 6891 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); 6892 6893 // Check if each of the unsafe registers are available... 6894 bool W16AvailableInBlock = LRU.available(AArch64::W16); 6895 bool W17AvailableInBlock = LRU.available(AArch64::W17); 6896 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); 6897 6898 // If all of these are dead (and not live out), we know we don't have to check 6899 // them later. 6900 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) 6901 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; 6902 6903 // Now, add the live outs to the set. 6904 LRU.addLiveOuts(MBB); 6905 6906 // If any of these registers is available in the MBB, but also a live out of 6907 // the block, then we know outlining is unsafe. 
6908 if (W16AvailableInBlock && !LRU.available(AArch64::W16)) 6909 return false; 6910 if (W17AvailableInBlock && !LRU.available(AArch64::W17)) 6911 return false; 6912 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) 6913 return false; 6914 6915 // Check if there's a call inside this MachineBasicBlock. If there is, then 6916 // set a flag. 6917 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) 6918 Flags |= MachineOutlinerMBBFlags::HasCalls; 6919 6920 MachineFunction *MF = MBB.getParent(); 6921 6922 // In the event that we outline, we may have to save LR. If there is an 6923 // available register in the MBB, then we'll always save LR there. Check if 6924 // this is true. 6925 bool CanSaveLR = false; 6926 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6927 MF->getSubtarget().getRegisterInfo()); 6928 6929 // Check if there is an available register across the sequence that we can 6930 // use. 6931 for (unsigned Reg : AArch64::GPR64RegClass) { 6932 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && 6933 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { 6934 CanSaveLR = true; 6935 break; 6936 } 6937 } 6938 6939 // Check if we have a register we can save LR to, and if LR was used 6940 // somewhere. If both of those things are true, then we need to evaluate the 6941 // safety of outlining stack instructions later. 6942 if (!CanSaveLR && !LRU.available(AArch64::LR)) 6943 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 6944 6945 return true; 6946 } 6947 6948 outliner::InstrType 6949 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, 6950 unsigned Flags) const { 6951 MachineInstr &MI = *MIT; 6952 MachineBasicBlock *MBB = MI.getParent(); 6953 MachineFunction *MF = MBB->getParent(); 6954 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 6955 6956 // Don't outline anything used for return address signing. 
The outlined 6957 // function will get signed later if needed 6958 switch (MI.getOpcode()) { 6959 case AArch64::PACIASP: 6960 case AArch64::PACIBSP: 6961 case AArch64::AUTIASP: 6962 case AArch64::AUTIBSP: 6963 case AArch64::RETAA: 6964 case AArch64::RETAB: 6965 case AArch64::EMITBKEY: 6966 return outliner::InstrType::Illegal; 6967 } 6968 6969 // Don't outline LOHs. 6970 if (FuncInfo->getLOHRelated().count(&MI)) 6971 return outliner::InstrType::Illegal; 6972 6973 // We can only outline these if we will tail call the outlined function, or 6974 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 6975 // in a tail call. 6976 // 6977 // FIXME: If the proper fixups for the offset are implemented, this should be 6978 // possible. 6979 if (MI.isCFIInstruction()) 6980 return outliner::InstrType::Legal; 6981 6982 // Don't allow debug values to impact outlining type. 6983 if (MI.isDebugInstr() || MI.isIndirectDebugValue()) 6984 return outliner::InstrType::Invisible; 6985 6986 // At this point, KILL instructions don't really tell us much so we can go 6987 // ahead and skip over them. 6988 if (MI.isKill()) 6989 return outliner::InstrType::Invisible; 6990 6991 // Is this a terminator for a basic block? 6992 if (MI.isTerminator()) { 6993 6994 // Is this the end of a function? 6995 if (MI.getParent()->succ_empty()) 6996 return outliner::InstrType::Legal; 6997 6998 // It's not, so don't outline it. 6999 return outliner::InstrType::Illegal; 7000 } 7001 7002 // Make sure none of the operands are un-outlinable. 7003 for (const MachineOperand &MOP : MI.operands()) { 7004 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || 7005 MOP.isTargetIndex()) 7006 return outliner::InstrType::Illegal; 7007 7008 // If it uses LR or W30 explicitly, then don't touch it. 
7009 if (MOP.isReg() && !MOP.isImplicit() && 7010 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) 7011 return outliner::InstrType::Illegal; 7012 } 7013 7014 // Special cases for instructions that can always be outlined, but will fail 7015 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always 7016 // be outlined because they don't require a *specific* value to be in LR. 7017 if (MI.getOpcode() == AArch64::ADRP) 7018 return outliner::InstrType::Legal; 7019 7020 // If MI is a call we might be able to outline it. We don't want to outline 7021 // any calls that rely on the position of items on the stack. When we outline 7022 // something containing a call, we have to emit a save and restore of LR in 7023 // the outlined function. Currently, this always happens by saving LR to the 7024 // stack. Thus, if we outline, say, half the parameters for a function call 7025 // plus the call, then we'll break the callee's expectations for the layout 7026 // of the stack. 7027 // 7028 // FIXME: Allow calls to functions which construct a stack frame, as long 7029 // as they don't access arguments on the stack. 7030 // FIXME: Figure out some way to analyze functions defined in other modules. 7031 // We should be able to compute the memory usage based on the IR calling 7032 // convention, even if we can't see the definition. 7033 if (MI.isCall()) { 7034 // Get the function associated with the call. Look at each operand and find 7035 // the one that represents the callee and get its name. 7036 const Function *Callee = nullptr; 7037 for (const MachineOperand &MOP : MI.operands()) { 7038 if (MOP.isGlobal()) { 7039 Callee = dyn_cast<Function>(MOP.getGlobal()); 7040 break; 7041 } 7042 } 7043 7044 // Never outline calls to mcount. There isn't any rule that would require 7045 // this, but the Linux kernel's "ftrace" feature depends on it. 
7046 if (Callee && Callee->getName() == "\01_mcount") 7047 return outliner::InstrType::Illegal; 7048 7049 // If we don't know anything about the callee, assume it depends on the 7050 // stack layout of the caller. In that case, it's only legal to outline 7051 // as a tail-call. Explicitly list the call instructions we know about so we 7052 // don't get unexpected results with call pseudo-instructions. 7053 auto UnknownCallOutlineType = outliner::InstrType::Illegal; 7054 if (MI.getOpcode() == AArch64::BLR || 7055 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) 7056 UnknownCallOutlineType = outliner::InstrType::LegalTerminator; 7057 7058 if (!Callee) 7059 return UnknownCallOutlineType; 7060 7061 // We have a function we have information about. Check it if it's something 7062 // can safely outline. 7063 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); 7064 7065 // We don't know what's going on with the callee at all. Don't touch it. 7066 if (!CalleeMF) 7067 return UnknownCallOutlineType; 7068 7069 // Check if we know anything about the callee saves on the function. If we 7070 // don't, then don't touch it, since that implies that we haven't 7071 // computed anything about its stack frame yet. 7072 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 7073 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 7074 MFI.getNumObjects() > 0) 7075 return UnknownCallOutlineType; 7076 7077 // At this point, we can say that CalleeMF ought to not pass anything on the 7078 // stack. Therefore, we can outline it. 7079 return outliner::InstrType::Legal; 7080 } 7081 7082 // Don't outline positions. 7083 if (MI.isPosition()) 7084 return outliner::InstrType::Illegal; 7085 7086 // Don't touch the link register or W30. 
// Tail of a definition whose opening lies before this chunk; the visible part
// finishes computing an outliner::InstrType verdict for MI.

  // Any instruction that reads or modifies W30 is rejected.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
  // The check is done on the HINT immediate: 32, 34, 36 and 38 are the values
  // this code treats as BTI.
  if (MI.getOpcode() == AArch64::HINT) {
    int64_t Imm = MI.getOperand(0).getImm();
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return outliner::InstrType::Illegal;
  }

  return outliner::InstrType::Legal;
}

/// Adjust the immediate offsets of stack accesses in \p MBB to account for a
/// 16-byte push.
///
/// Every instruction that may load or store, for which
/// getMemOperandWithOffsetWidth() succeeds, and whose base operand is not a
/// register other than SP, gets its immediate offset operand replaced with
/// (Offset + 16) divided by the scale reported by getMemOpInfo().
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
  for (MachineInstr &MI : MBB) {
    const MachineOperand *Base;
    unsigned Width;
    int64_t Offset;
    bool OffsetIsScalable;

    // Is this a load or store with an immediate offset with SP as the base?
    // Note that the last clause only skips *register* bases other than SP; a
    // base operand for which isReg() is false falls through to the fix-up.
    if (!MI.mayLoadOrStore() ||
        !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
                                      &RI) ||
        (Base->isReg() && Base->getReg() != AArch64::SP))
      continue;

    // It is, so we have to fix it up.
    TypeSize Scale(0U, false);
    // Out-parameters of getMemOpInfo() that are not used here.
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    // Only Scale is consumed below; Width is overwritten but not read again.
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");
    assert(!OffsetIsScalable && "Expected offset to be a byte offset");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
    // Offset is in bytes (see the assert above); the operand stores it divided
    // by Scale.
    int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
    StackOffsetOperand.setImm(NewImm);
  }
}

/// Insert return-address signing and authentication code into \p MBB.
///
/// Does nothing unless \p ShouldSignReturnAddr is true. Otherwise a PAC
/// instruction (preceded by EMITBKEY when the B key is used) and a
/// negate-RA-state CFI instruction are inserted at the beginning of the block,
/// and either the block's first terminator is replaced by RETAA/RETAB or an
/// AUTIASP/AUTIBSP is inserted in front of it.
///
/// \param MF function that owns \p MBB; supplies the subtarget and receives
///        the new CFI frame instruction.
/// \param MBB block to modify.
/// \param ShouldSignReturnAddr whether any signing code is emitted at all.
/// \param ShouldSignReturnAddrWithAKey selects the A-key variants when true,
///        the B-key variants otherwise.
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
                                 bool ShouldSignReturnAddr,
                                 bool ShouldSignReturnAddrWithAKey) {
  if (ShouldSignReturnAddr) {
    // Insertion points: PAC goes at the block start, AUT before the first
    // terminator (which may be MBB.end()).
    MachineBasicBlock::iterator MBBPAC = MBB.begin();
    MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
    const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
    DebugLoc DL;

    // Reuse the terminator's debug location for the authentication code.
    if (MBBAUT != MBB.end())
      DL = MBBAUT->getDebugLoc();

    // At the very beginning of the basic block we insert the following
    // depending on the key type
    //
    // a_key:                   b_key:
    //    PACIASP                   EMITBKEY
    //    CFI_INSTRUCTION           PACIBSP
    //                              CFI_INSTRUCTION
    //
    // When the subtarget reports hasPAuth(), PACIA/PACIB with explicit
    // operands are used instead of PACIASP/PACIBSP.
    unsigned PACI;
    if (ShouldSignReturnAddrWithAKey) {
      PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
    } else {
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
    }

    auto MI = BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(PACI));
    // Only the PACIA/PACIB forms receive explicit LR/SP operands here.
    if (Subtarget.hasPAuth())
      MI.addReg(AArch64::LR, RegState::Define)
          .addReg(AArch64::LR)
          .addReg(AArch64::SP, RegState::InternalRead);
    MI.setMIFlag(MachineInstr::FrameSetup);

    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);

    // If v8.3a features are available we can replace a RET instruction by
    // RETAA or RETAB and omit the AUT instructions
    if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
        MBBAUT->getOpcode() == AArch64::RET) {
      // The new return inherits the implicit operands of the RET it replaces;
      // the old RET is erased only after they have been copied.
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
                                                    : AArch64::RETAB))
          .copyImplicitOps(*MBBAUT);
      MBB.erase(MBBAUT);
    } else {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
                                                    : AArch64::AUTIBSP))
          .setMIFlag(MachineInstr::FrameDestroy);
    }
  }
}

/// Finish the body of an outlined function held in \p MBB.
///
/// Depending on OF.FrameConstructionID this:
///  - records the outlining style ("Tail Call", "Thunk" or "Function") in the
///    function's AArch64FunctionInfo;
///  - for thunks, replaces the trailing call with a tail-call pseudo;
///  - if the block contains a call that is not also a return, saves LR on
///    the stack around the body (with CFI) and fixes up stack offsets;
///  - for non-tail-call styles, appends a RET;
///  - applies return-address signing via signOutlinedFunction();
///  - for MachineOutlinerDefault, fixes up stack offsets at the end.
///
/// \param MBB the single block of the outlined function.
/// \param MF the outlined function.
/// \param OF description of the outlined function and its candidates.
void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    // The tail call reuses the original call's first operand (its target).
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  // Calls that are also returns (tail calls) do not count.
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    // For tail-call and thunk styles the last instruction is the (tail) call,
    // so the restore has to go in front of it.
    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    const TargetSubtargetInfo &STI = MF.getSubtarget();
    const MCRegisterInfo *MRI = STI.getRegisterInfo();
    unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

    // NOTE(review): `It` was just assigned the result of inserting STRXpre and
    // is then used as the insertion point for both CFI instructions below --
    // confirm they end up where intended relative to the store.

    // Add a CFI saying the stack was moved 16 B down.
    int64_t StackPosEntry =
        MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(StackPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Add a CFI saying that the LR that we want to find is now 16 B higher than
    // before.
    int64_t LRPosEntry =
        MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(LRPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // If a bunch of candidates reach this point they must agree on their return
  // address signing. It is therefore enough to just consider the signing
  // behaviour of one of them
  const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
  // The argument is true exactly when the LR save/restore above was emitted.
  bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);

  // a_key is the default
  bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey();

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                         ShouldSignReturnAddrWithAKey);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  // Signing runs after the RET has been appended, so signOutlinedFunction()
  // sees it as the block's terminator.
  signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                       ShouldSignReturnAddrWithAKey);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

/// Insert a call (or tail call) to the outlined function \p MF into \p MBB at
/// \p It, using the call style recorded in \p C.
///
/// - MachineOutlinerTailCall: a single TCRETURNdi.
/// - MachineOutlinerNoLRSave / MachineOutlinerThunk: a single BL.
/// - MachineOutlinerRegSave: BL bracketed by ORRXrs moves that stash LR in the
///   register returned by findRegisterToSaveLRTo().
/// - anything else: BL bracketed by an STRXpre/LDRXpost pair that spills LR to
///   the stack.
///
/// \param M module used to look up the outlined function by name.
/// \param MBB block receiving the call.
/// \param It insertion point; updated in place. In the single-instruction
///        cases it ends up at the inserted call, otherwise at the inserted
///        restore instruction.
/// \param MF the outlined function being called.
/// \param C the candidate being replaced.
/// \returns an iterator to the inserted call instruction.
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
    // (ORRXrs with XZR and a zero shift is the register-move form that
    // isCopyInstrImpl() in this file recognises.)
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  // Emit Save, call, Restore in that order; after each insert the iterator is
  // stepped past the new instruction so the next one lands behind it.
  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

/// Return true when \p MF's IR function has the minsize attribute, i.e. when
/// hasMinSize() holds.
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

/// Recognise the ORR-based register move idiom.
///
/// \returns the {destination, source} operand pair (operands 0 and 2) when
/// \p MI is ORRWrs/ORRXrs whose operand 1 is WZR/XZR respectively and whose
/// operand 3 is 0; None otherwise.
Optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
  // and zero immediate operands used as an alias for mov instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return None;
}

/// Describe \p MI as "Reg = SrcReg + Offset" if it is an immediate add or
/// subtract that defines \p Reg.
///
/// Handles ADD/ADDS/SUB/SUBS with immediate (W and X forms). The offset is
/// operand 2 shifted left by operand 3, negated for the SUB variants.
///
/// \returns {operand 1's register, offset}, or None when operand 0 is not the
/// register \p Reg, the opcode is not one of the handled ones, or the operands
/// do not have the expected register/immediate kinds.
Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
                                                      Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return None;

  switch (MI.getOpcode()) {
  default:
    return None;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    // Subtractions share the ADD path below with the sign flipped.
    Sign *= -1;
    LLVM_FALLTHROUGH;
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return None;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
///
/// \param MI instruction to inspect; anything TII->isCopyInstr() rejects
///        yields None.
/// \param DescribedReg register whose value the caller wants described.
/// \param TII used for the copy query.
/// \param TRI used for the super-/sub-register queries.
/// \returns a register operand paired with an empty DIExpression, or None.
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return None;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  // Empty expression: the value is described by the returned register itself.
  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of a ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  // Any remaining overlap between the two registers is unexpected here.
  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return None;
}

/// Describe the value that \p MI loads into \p Reg, for the opcodes this
/// target handles specially.
///
/// - MOVZWi/MOVZXi: if TRI->isSuperRegisterEq(destination, Reg) holds and
///   operand 1 is an immediate, the result is that immediate shifted left by
///   operand 2, with a null expression.
/// - ORRWrs/ORRXrs: delegated to describeORRLoadedValue().
/// - everything else: delegated to TargetInstrInfo::describeLoadedValue().
Optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return None;

    if (!MI.getOperand(1).isImm())
      return None;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

/// Heuristic: is the generic extend \p ExtMI likely to be folded away?
///
/// \p ExtMI must be G_SEXT, G_ZEXT or G_ANYEXT (asserted). G_ANYEXT always
/// yields true. Otherwise the result is true only when the extend's result has
/// exactly one non-debug use and that user is a G_PTR_ADD.
bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

/// Return the bits of \p Opc's TSFlags selected by AArch64::ElementSizeMask.
uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

/// Return true if \p Opc has AArch64::InstrFlagIsPTestLike set in its TSFlags.
bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

/// Return true if \p Opc has AArch64::InstrFlagIsWhile set in its TSFlags.
bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

/// Return the tail-duplication size for \p OptLevel: 6 at
/// CodeGenOpt::Aggressive or above, 2 otherwise.
unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const {
  return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2;
}

/// Return the opcode to use for an indirect call in \p MF: BLRNoIP when the
/// subtarget's hardenSlsBlr() is set, BLR otherwise.
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

// Second inclusion of the generated file (it is also included near the top of
// this file with a different GET_* macro); presumably these two macros select
// the helper and instruction-mapping sections -- confirm against the .inc.
#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"