//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator> 50 #include <utility> 51 52 using namespace llvm; 53 54 #define GET_INSTRINFO_CTOR_DTOR 55 #include "AArch64GenInstrInfo.inc" 56 57 static cl::opt<unsigned> TBZDisplacementBits( 58 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 59 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 60 61 static cl::opt<unsigned> CBZDisplacementBits( 62 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 63 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 64 65 static cl::opt<unsigned> 66 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 67 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 68 69 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 70 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 71 AArch64::CATCHRET), 72 RI(STI.getTargetTriple()), Subtarget(STI) {} 73 74 /// GetInstSize - Return the number of bytes of code the specified 75 /// instruction may be. This returns the maximum number of bytes. 76 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 77 const MachineBasicBlock &MBB = *MI.getParent(); 78 const MachineFunction *MF = MBB.getParent(); 79 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 80 81 { 82 auto Op = MI.getOpcode(); 83 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 84 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 85 } 86 87 // Meta-instructions emit no code. 88 if (MI.isMetaInstruction()) 89 return 0; 90 91 // FIXME: We currently only handle pseudoinstructions that don't get expanded 92 // before the assembly printer. 93 unsigned NumBytes = 0; 94 const MCInstrDesc &Desc = MI.getDesc(); 95 switch (Desc.getOpcode()) { 96 default: 97 // Anything not explicitly designated otherwise is a normal 4-byte insn. 
98 NumBytes = 4; 99 break; 100 case TargetOpcode::STACKMAP: 101 // The upper bound for a stackmap intrinsic is the full length of its shadow 102 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 103 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 104 break; 105 case TargetOpcode::PATCHPOINT: 106 // The size of the patchpoint intrinsic is the number of bytes requested 107 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 108 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 109 break; 110 case AArch64::TLSDESC_CALLSEQ: 111 // This gets lowered to an instruction sequence which takes 16 bytes 112 NumBytes = 16; 113 break; 114 case AArch64::JumpTableDest32: 115 case AArch64::JumpTableDest16: 116 case AArch64::JumpTableDest8: 117 NumBytes = 12; 118 break; 119 case AArch64::SPACE: 120 NumBytes = MI.getOperand(1).getImm(); 121 break; 122 case TargetOpcode::BUNDLE: 123 NumBytes = getInstBundleLength(MI); 124 break; 125 } 126 127 return NumBytes; 128 } 129 130 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 131 unsigned Size = 0; 132 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 133 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 134 while (++I != E && I->isInsideBundle()) { 135 assert(!I->isBundle() && "No nested bundle!"); 136 Size += getInstSizeInBytes(*I); 137 } 138 return Size; 139 } 140 141 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 142 SmallVectorImpl<MachineOperand> &Cond) { 143 // Block ends with fall-through condbranch. 
144 switch (LastInst->getOpcode()) { 145 default: 146 llvm_unreachable("Unknown branch instruction?"); 147 case AArch64::Bcc: 148 Target = LastInst->getOperand(1).getMBB(); 149 Cond.push_back(LastInst->getOperand(0)); 150 break; 151 case AArch64::CBZW: 152 case AArch64::CBZX: 153 case AArch64::CBNZW: 154 case AArch64::CBNZX: 155 Target = LastInst->getOperand(1).getMBB(); 156 Cond.push_back(MachineOperand::CreateImm(-1)); 157 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 158 Cond.push_back(LastInst->getOperand(0)); 159 break; 160 case AArch64::TBZW: 161 case AArch64::TBZX: 162 case AArch64::TBNZW: 163 case AArch64::TBNZX: 164 Target = LastInst->getOperand(2).getMBB(); 165 Cond.push_back(MachineOperand::CreateImm(-1)); 166 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 167 Cond.push_back(LastInst->getOperand(0)); 168 Cond.push_back(LastInst->getOperand(1)); 169 } 170 } 171 172 static unsigned getBranchDisplacementBits(unsigned Opc) { 173 switch (Opc) { 174 default: 175 llvm_unreachable("unexpected opcode!"); 176 case AArch64::B: 177 return 64; 178 case AArch64::TBNZW: 179 case AArch64::TBZW: 180 case AArch64::TBNZX: 181 case AArch64::TBZX: 182 return TBZDisplacementBits; 183 case AArch64::CBNZW: 184 case AArch64::CBZW: 185 case AArch64::CBNZX: 186 case AArch64::CBZX: 187 return CBZDisplacementBits; 188 case AArch64::Bcc: 189 return BCCDisplacementBits; 190 } 191 } 192 193 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 194 int64_t BrOffset) const { 195 unsigned Bits = getBranchDisplacementBits(BranchOp); 196 assert(Bits >= 3 && "max branch displacement must be enough to jump" 197 "over conditional branch expansion"); 198 return isIntN(Bits, BrOffset / 4); 199 } 200 201 MachineBasicBlock * 202 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 203 switch (MI.getOpcode()) { 204 default: 205 llvm_unreachable("unexpected opcode!"); 206 case AArch64::B: 207 return MI.getOperand(0).getMBB(); 208 case 
AArch64::TBZW: 209 case AArch64::TBNZW: 210 case AArch64::TBZX: 211 case AArch64::TBNZX: 212 return MI.getOperand(2).getMBB(); 213 case AArch64::CBZW: 214 case AArch64::CBNZW: 215 case AArch64::CBZX: 216 case AArch64::CBNZX: 217 case AArch64::Bcc: 218 return MI.getOperand(1).getMBB(); 219 } 220 } 221 222 // Branch analysis. 223 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 224 MachineBasicBlock *&TBB, 225 MachineBasicBlock *&FBB, 226 SmallVectorImpl<MachineOperand> &Cond, 227 bool AllowModify) const { 228 // If the block has no terminators, it just falls into the block after it. 229 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 230 if (I == MBB.end()) 231 return false; 232 233 if (!isUnpredicatedTerminator(*I)) 234 return false; 235 236 // Get the last instruction in the block. 237 MachineInstr *LastInst = &*I; 238 239 // If there is only one terminator instruction, process it. 240 unsigned LastOpc = LastInst->getOpcode(); 241 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 242 if (isUncondBranchOpcode(LastOpc)) { 243 TBB = LastInst->getOperand(0).getMBB(); 244 return false; 245 } 246 if (isCondBranchOpcode(LastOpc)) { 247 // Block ends with fall-through condbranch. 248 parseCondBranch(LastInst, TBB, Cond); 249 return false; 250 } 251 return true; // Can't handle indirect branch. 252 } 253 254 // Get the instruction before it if it is a terminator. 255 MachineInstr *SecondLastInst = &*I; 256 unsigned SecondLastOpc = SecondLastInst->getOpcode(); 257 258 // If AllowModify is true and the block ends with two or more unconditional 259 // branches, delete all but the first unconditional branch. 260 if (AllowModify && isUncondBranchOpcode(LastOpc)) { 261 while (isUncondBranchOpcode(SecondLastOpc)) { 262 LastInst->eraseFromParent(); 263 LastInst = SecondLastInst; 264 LastOpc = LastInst->getOpcode(); 265 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 266 // Return now the only terminator is an unconditional branch. 
267 TBB = LastInst->getOperand(0).getMBB(); 268 return false; 269 } else { 270 SecondLastInst = &*I; 271 SecondLastOpc = SecondLastInst->getOpcode(); 272 } 273 } 274 } 275 276 // If there are three terminators, we don't know what sort of block this is. 277 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) 278 return true; 279 280 // If the block ends with a B and a Bcc, handle it. 281 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 282 parseCondBranch(SecondLastInst, TBB, Cond); 283 FBB = LastInst->getOperand(0).getMBB(); 284 return false; 285 } 286 287 // If the block ends with two unconditional branches, handle it. The second 288 // one is not executed, so remove it. 289 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 290 TBB = SecondLastInst->getOperand(0).getMBB(); 291 I = LastInst; 292 if (AllowModify) 293 I->eraseFromParent(); 294 return false; 295 } 296 297 // ...likewise if it ends with an indirect branch followed by an unconditional 298 // branch. 299 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 300 I = LastInst; 301 if (AllowModify) 302 I->eraseFromParent(); 303 return true; 304 } 305 306 // Otherwise, can't handle this. 
307 return true; 308 } 309 310 bool AArch64InstrInfo::reverseBranchCondition( 311 SmallVectorImpl<MachineOperand> &Cond) const { 312 if (Cond[0].getImm() != -1) { 313 // Regular Bcc 314 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 315 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 316 } else { 317 // Folded compare-and-branch 318 switch (Cond[1].getImm()) { 319 default: 320 llvm_unreachable("Unknown conditional branch!"); 321 case AArch64::CBZW: 322 Cond[1].setImm(AArch64::CBNZW); 323 break; 324 case AArch64::CBNZW: 325 Cond[1].setImm(AArch64::CBZW); 326 break; 327 case AArch64::CBZX: 328 Cond[1].setImm(AArch64::CBNZX); 329 break; 330 case AArch64::CBNZX: 331 Cond[1].setImm(AArch64::CBZX); 332 break; 333 case AArch64::TBZW: 334 Cond[1].setImm(AArch64::TBNZW); 335 break; 336 case AArch64::TBNZW: 337 Cond[1].setImm(AArch64::TBZW); 338 break; 339 case AArch64::TBZX: 340 Cond[1].setImm(AArch64::TBNZX); 341 break; 342 case AArch64::TBNZX: 343 Cond[1].setImm(AArch64::TBZX); 344 break; 345 } 346 } 347 348 return false; 349 } 350 351 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 352 int *BytesRemoved) const { 353 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 354 if (I == MBB.end()) 355 return 0; 356 357 if (!isUncondBranchOpcode(I->getOpcode()) && 358 !isCondBranchOpcode(I->getOpcode())) 359 return 0; 360 361 // Remove the branch. 362 I->eraseFromParent(); 363 364 I = MBB.end(); 365 366 if (I == MBB.begin()) { 367 if (BytesRemoved) 368 *BytesRemoved = 4; 369 return 1; 370 } 371 --I; 372 if (!isCondBranchOpcode(I->getOpcode())) { 373 if (BytesRemoved) 374 *BytesRemoved = 4; 375 return 1; 376 } 377 378 // Remove the branch. 
379 I->eraseFromParent(); 380 if (BytesRemoved) 381 *BytesRemoved = 8; 382 383 return 2; 384 } 385 386 void AArch64InstrInfo::instantiateCondBranch( 387 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 388 ArrayRef<MachineOperand> Cond) const { 389 if (Cond[0].getImm() != -1) { 390 // Regular Bcc 391 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 392 } else { 393 // Folded compare-and-branch 394 // Note that we use addOperand instead of addReg to keep the flags. 395 const MachineInstrBuilder MIB = 396 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 397 if (Cond.size() > 3) 398 MIB.addImm(Cond[3].getImm()); 399 MIB.addMBB(TBB); 400 } 401 } 402 403 unsigned AArch64InstrInfo::insertBranch( 404 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 405 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 406 // Shouldn't be a fall through. 407 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 408 409 if (!FBB) { 410 if (Cond.empty()) // Unconditional branch? 411 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 412 else 413 instantiateCondBranch(MBB, DL, TBB, Cond); 414 415 if (BytesAdded) 416 *BytesAdded = 4; 417 418 return 1; 419 } 420 421 // Two-way conditional branch. 422 instantiateCondBranch(MBB, DL, TBB, Cond); 423 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 424 425 if (BytesAdded) 426 *BytesAdded = 8; 427 428 return 2; 429 } 430 431 // Find the original register that VReg is copied from. 432 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 433 while (Register::isVirtualRegister(VReg)) { 434 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 435 if (!DefMI->isFullCopy()) 436 return VReg; 437 VReg = DefMI->getOperand(1).getReg(); 438 } 439 return VReg; 440 } 441 442 // Determine if VReg is defined by an instruction that can be folded into a 443 // csel instruction. 
If so, return the folded opcode, and the replacement 444 // register. 445 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 446 unsigned *NewVReg = nullptr) { 447 VReg = removeCopies(MRI, VReg); 448 if (!Register::isVirtualRegister(VReg)) 449 return 0; 450 451 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 452 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 453 unsigned Opc = 0; 454 unsigned SrcOpNum = 0; 455 switch (DefMI->getOpcode()) { 456 case AArch64::ADDSXri: 457 case AArch64::ADDSWri: 458 // if NZCV is used, do not fold. 459 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 460 return 0; 461 // fall-through to ADDXri and ADDWri. 462 LLVM_FALLTHROUGH; 463 case AArch64::ADDXri: 464 case AArch64::ADDWri: 465 // add x, 1 -> csinc. 466 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 467 DefMI->getOperand(3).getImm() != 0) 468 return 0; 469 SrcOpNum = 1; 470 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 471 break; 472 473 case AArch64::ORNXrr: 474 case AArch64::ORNWrr: { 475 // not x -> csinv, represented as orn dst, xzr, src. 476 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 477 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 478 return 0; 479 SrcOpNum = 2; 480 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 481 break; 482 } 483 484 case AArch64::SUBSXrr: 485 case AArch64::SUBSWrr: 486 // if NZCV is used, do not fold. 487 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 488 return 0; 489 // fall-through to SUBXrr and SUBWrr. 490 LLVM_FALLTHROUGH; 491 case AArch64::SUBXrr: 492 case AArch64::SUBWrr: { 493 // neg x -> csneg, represented as sub dst, xzr, src. 494 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 495 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 496 return 0; 497 SrcOpNum = 2; 498 Opc = Is64Bit ? 
AArch64::CSNEGXr : AArch64::CSNEGWr; 499 break; 500 } 501 default: 502 return 0; 503 } 504 assert(Opc && SrcOpNum && "Missing parameters"); 505 506 if (NewVReg) 507 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 508 return Opc; 509 } 510 511 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 512 ArrayRef<MachineOperand> Cond, 513 Register DstReg, Register TrueReg, 514 Register FalseReg, int &CondCycles, 515 int &TrueCycles, 516 int &FalseCycles) const { 517 // Check register classes. 518 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 519 const TargetRegisterClass *RC = 520 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 521 if (!RC) 522 return false; 523 524 // Also need to check the dest regclass, in case we're trying to optimize 525 // something like: 526 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 527 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 528 return false; 529 530 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 531 unsigned ExtraCondLat = Cond.size() != 1; 532 533 // GPRs are handled by csel. 534 // FIXME: Fold in x+1, -x, and ~x when applicable. 535 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 536 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 537 // Single-cycle csel, csinc, csinv, and csneg. 538 CondCycles = 1 + ExtraCondLat; 539 TrueCycles = FalseCycles = 1; 540 if (canFoldIntoCSel(MRI, TrueReg)) 541 TrueCycles = 0; 542 else if (canFoldIntoCSel(MRI, FalseReg)) 543 FalseCycles = 0; 544 return true; 545 } 546 547 // Scalar floating point is handled by fcsel. 548 // FIXME: Form fabs, fmin, and fmax when applicable. 549 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 550 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 551 CondCycles = 5 + ExtraCondLat; 552 TrueCycles = FalseCycles = 2; 553 return true; 554 } 555 556 // Can't do vectors. 
557 return false; 558 } 559 560 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 561 MachineBasicBlock::iterator I, 562 const DebugLoc &DL, Register DstReg, 563 ArrayRef<MachineOperand> Cond, 564 Register TrueReg, Register FalseReg) const { 565 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 566 567 // Parse the condition code, see parseCondBranch() above. 568 AArch64CC::CondCode CC; 569 switch (Cond.size()) { 570 default: 571 llvm_unreachable("Unknown condition opcode in Cond"); 572 case 1: // b.cc 573 CC = AArch64CC::CondCode(Cond[0].getImm()); 574 break; 575 case 3: { // cbz/cbnz 576 // We must insert a compare against 0. 577 bool Is64Bit; 578 switch (Cond[1].getImm()) { 579 default: 580 llvm_unreachable("Unknown branch opcode in Cond"); 581 case AArch64::CBZW: 582 Is64Bit = false; 583 CC = AArch64CC::EQ; 584 break; 585 case AArch64::CBZX: 586 Is64Bit = true; 587 CC = AArch64CC::EQ; 588 break; 589 case AArch64::CBNZW: 590 Is64Bit = false; 591 CC = AArch64CC::NE; 592 break; 593 case AArch64::CBNZX: 594 Is64Bit = true; 595 CC = AArch64CC::NE; 596 break; 597 } 598 Register SrcReg = Cond[2].getReg(); 599 if (Is64Bit) { 600 // cmp reg, #0 is actually subs xzr, reg, #0. 601 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 602 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 603 .addReg(SrcReg) 604 .addImm(0) 605 .addImm(0); 606 } else { 607 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 608 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 609 .addReg(SrcReg) 610 .addImm(0) 611 .addImm(0); 612 } 613 break; 614 } 615 case 4: { // tbz/tbnz 616 // We must insert a tst instruction. 617 switch (Cond[1].getImm()) { 618 default: 619 llvm_unreachable("Unknown branch opcode in Cond"); 620 case AArch64::TBZW: 621 case AArch64::TBZX: 622 CC = AArch64CC::EQ; 623 break; 624 case AArch64::TBNZW: 625 case AArch64::TBNZX: 626 CC = AArch64CC::NE; 627 break; 628 } 629 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
630 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW) 631 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR) 632 .addReg(Cond[2].getReg()) 633 .addImm( 634 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32)); 635 else 636 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR) 637 .addReg(Cond[2].getReg()) 638 .addImm( 639 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64)); 640 break; 641 } 642 } 643 644 unsigned Opc = 0; 645 const TargetRegisterClass *RC = nullptr; 646 bool TryFold = false; 647 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) { 648 RC = &AArch64::GPR64RegClass; 649 Opc = AArch64::CSELXr; 650 TryFold = true; 651 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) { 652 RC = &AArch64::GPR32RegClass; 653 Opc = AArch64::CSELWr; 654 TryFold = true; 655 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) { 656 RC = &AArch64::FPR64RegClass; 657 Opc = AArch64::FCSELDrrr; 658 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) { 659 RC = &AArch64::FPR32RegClass; 660 Opc = AArch64::FCSELSrrr; 661 } 662 assert(RC && "Unsupported regclass"); 663 664 // Try folding simple instructions into the csel. 665 if (TryFold) { 666 unsigned NewVReg = 0; 667 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg); 668 if (FoldedOpc) { 669 // The folded opcodes csinc, csinc and csneg apply the operation to 670 // FalseReg, so we need to invert the condition. 671 CC = AArch64CC::getInvertedCondCode(CC); 672 TrueReg = FalseReg; 673 } else 674 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg); 675 676 // Fold the operation. Leave any dead instructions for DCE to clean up. 677 if (FoldedOpc) { 678 FalseReg = NewVReg; 679 Opc = FoldedOpc; 680 // The extends the live range of NewVReg. 681 MRI.clearKillFlags(NewVReg); 682 } 683 } 684 685 // Pull all virtual register into the appropriate class. 
686 MRI.constrainRegClass(TrueReg, RC); 687 MRI.constrainRegClass(FalseReg, RC); 688 689 // Insert the csel. 690 BuildMI(MBB, I, DL, get(Opc), DstReg) 691 .addReg(TrueReg) 692 .addReg(FalseReg) 693 .addImm(CC); 694 } 695 696 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 697 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { 698 uint64_t Imm = MI.getOperand(1).getImm(); 699 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); 700 uint64_t Encoding; 701 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); 702 } 703 704 // FIXME: this implementation should be micro-architecture dependent, so a 705 // micro-architecture target hook should be introduced here in future. 706 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { 707 if (!Subtarget.hasCustomCheapAsMoveHandling()) 708 return MI.isAsCheapAsAMove(); 709 710 const unsigned Opcode = MI.getOpcode(); 711 712 // Firstly, check cases gated by features. 713 714 if (Subtarget.hasZeroCycleZeroingFP()) { 715 if (Opcode == AArch64::FMOVH0 || 716 Opcode == AArch64::FMOVS0 || 717 Opcode == AArch64::FMOVD0) 718 return true; 719 } 720 721 if (Subtarget.hasZeroCycleZeroingGP()) { 722 if (Opcode == TargetOpcode::COPY && 723 (MI.getOperand(1).getReg() == AArch64::WZR || 724 MI.getOperand(1).getReg() == AArch64::XZR)) 725 return true; 726 } 727 728 // Secondly, check cases specific to sub-targets. 729 730 if (Subtarget.hasExynosCheapAsMoveHandling()) { 731 if (isExynosCheapAsMove(MI)) 732 return true; 733 734 return MI.isAsCheapAsAMove(); 735 } 736 737 // Finally, check generic cases. 
738 739 switch (Opcode) { 740 default: 741 return false; 742 743 // add/sub on register without shift 744 case AArch64::ADDWri: 745 case AArch64::ADDXri: 746 case AArch64::SUBWri: 747 case AArch64::SUBXri: 748 return (MI.getOperand(3).getImm() == 0); 749 750 // logical ops on immediate 751 case AArch64::ANDWri: 752 case AArch64::ANDXri: 753 case AArch64::EORWri: 754 case AArch64::EORXri: 755 case AArch64::ORRWri: 756 case AArch64::ORRXri: 757 return true; 758 759 // logical ops on register without shift 760 case AArch64::ANDWrr: 761 case AArch64::ANDXrr: 762 case AArch64::BICWrr: 763 case AArch64::BICXrr: 764 case AArch64::EONWrr: 765 case AArch64::EONXrr: 766 case AArch64::EORWrr: 767 case AArch64::EORXrr: 768 case AArch64::ORNWrr: 769 case AArch64::ORNXrr: 770 case AArch64::ORRWrr: 771 case AArch64::ORRXrr: 772 return true; 773 774 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 775 // ORRXri, it is as cheap as MOV 776 case AArch64::MOVi32imm: 777 return canBeExpandedToORR(MI, 32); 778 case AArch64::MOVi64imm: 779 return canBeExpandedToORR(MI, 64); 780 } 781 782 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 783 } 784 785 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 786 switch (MI.getOpcode()) { 787 default: 788 return false; 789 790 case AArch64::ADDWrs: 791 case AArch64::ADDXrs: 792 case AArch64::ADDSWrs: 793 case AArch64::ADDSXrs: { 794 unsigned Imm = MI.getOperand(3).getImm(); 795 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 796 if (ShiftVal == 0) 797 return true; 798 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 799 } 800 801 case AArch64::ADDWrx: 802 case AArch64::ADDXrx: 803 case AArch64::ADDXrx64: 804 case AArch64::ADDSWrx: 805 case AArch64::ADDSXrx: 806 case AArch64::ADDSXrx64: { 807 unsigned Imm = MI.getOperand(3).getImm(); 808 switch (AArch64_AM::getArithExtendType(Imm)) { 809 default: 810 return false; 811 case AArch64_AM::UXTB: 812 case AArch64_AM::UXTH: 813 
case AArch64_AM::UXTW: 814 case AArch64_AM::UXTX: 815 return AArch64_AM::getArithShiftValue(Imm) <= 4; 816 } 817 } 818 819 case AArch64::SUBWrs: 820 case AArch64::SUBSWrs: { 821 unsigned Imm = MI.getOperand(3).getImm(); 822 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 823 return ShiftVal == 0 || 824 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 825 } 826 827 case AArch64::SUBXrs: 828 case AArch64::SUBSXrs: { 829 unsigned Imm = MI.getOperand(3).getImm(); 830 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 831 return ShiftVal == 0 || 832 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 833 } 834 835 case AArch64::SUBWrx: 836 case AArch64::SUBXrx: 837 case AArch64::SUBXrx64: 838 case AArch64::SUBSWrx: 839 case AArch64::SUBSXrx: 840 case AArch64::SUBSXrx64: { 841 unsigned Imm = MI.getOperand(3).getImm(); 842 switch (AArch64_AM::getArithExtendType(Imm)) { 843 default: 844 return false; 845 case AArch64_AM::UXTB: 846 case AArch64_AM::UXTH: 847 case AArch64_AM::UXTW: 848 case AArch64_AM::UXTX: 849 return AArch64_AM::getArithShiftValue(Imm) == 0; 850 } 851 } 852 853 case AArch64::LDRBBroW: 854 case AArch64::LDRBBroX: 855 case AArch64::LDRBroW: 856 case AArch64::LDRBroX: 857 case AArch64::LDRDroW: 858 case AArch64::LDRDroX: 859 case AArch64::LDRHHroW: 860 case AArch64::LDRHHroX: 861 case AArch64::LDRHroW: 862 case AArch64::LDRHroX: 863 case AArch64::LDRQroW: 864 case AArch64::LDRQroX: 865 case AArch64::LDRSBWroW: 866 case AArch64::LDRSBWroX: 867 case AArch64::LDRSBXroW: 868 case AArch64::LDRSBXroX: 869 case AArch64::LDRSHWroW: 870 case AArch64::LDRSHWroX: 871 case AArch64::LDRSHXroW: 872 case AArch64::LDRSHXroX: 873 case AArch64::LDRSWroW: 874 case AArch64::LDRSWroX: 875 case AArch64::LDRSroW: 876 case AArch64::LDRSroX: 877 case AArch64::LDRWroW: 878 case AArch64::LDRWroX: 879 case AArch64::LDRXroW: 880 case AArch64::LDRXroX: 881 case AArch64::PRFMroW: 882 case AArch64::PRFMroX: 883 case AArch64::STRBBroW: 884 case 
AArch64::STRBBroX: 885 case AArch64::STRBroW: 886 case AArch64::STRBroX: 887 case AArch64::STRDroW: 888 case AArch64::STRDroX: 889 case AArch64::STRHHroW: 890 case AArch64::STRHHroX: 891 case AArch64::STRHroW: 892 case AArch64::STRHroX: 893 case AArch64::STRQroW: 894 case AArch64::STRQroX: 895 case AArch64::STRSroW: 896 case AArch64::STRSroX: 897 case AArch64::STRWroW: 898 case AArch64::STRWroX: 899 case AArch64::STRXroW: 900 case AArch64::STRXroX: { 901 unsigned IsSigned = MI.getOperand(3).getImm(); 902 return !IsSigned; 903 } 904 } 905 } 906 907 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 908 unsigned Opc = MI.getOpcode(); 909 switch (Opc) { 910 default: 911 return false; 912 case AArch64::SEH_StackAlloc: 913 case AArch64::SEH_SaveFPLR: 914 case AArch64::SEH_SaveFPLR_X: 915 case AArch64::SEH_SaveReg: 916 case AArch64::SEH_SaveReg_X: 917 case AArch64::SEH_SaveRegP: 918 case AArch64::SEH_SaveRegP_X: 919 case AArch64::SEH_SaveFReg: 920 case AArch64::SEH_SaveFReg_X: 921 case AArch64::SEH_SaveFRegP: 922 case AArch64::SEH_SaveFRegP_X: 923 case AArch64::SEH_SetFP: 924 case AArch64::SEH_AddFP: 925 case AArch64::SEH_Nop: 926 case AArch64::SEH_PrologEnd: 927 case AArch64::SEH_EpilogStart: 928 case AArch64::SEH_EpilogEnd: 929 return true; 930 } 931 } 932 933 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 934 Register &SrcReg, Register &DstReg, 935 unsigned &SubIdx) const { 936 switch (MI.getOpcode()) { 937 default: 938 return false; 939 case AArch64::SBFMXri: // aka sxtw 940 case AArch64::UBFMXri: // aka uxtw 941 // Check for the 32 -> 64 bit extension case, these instructions can do 942 // much more. 943 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 944 return false; 945 // This is a signed or unsigned 32 -> 64 bit extension. 
946 SrcReg = MI.getOperand(1).getReg(); 947 DstReg = MI.getOperand(0).getReg(); 948 SubIdx = AArch64::sub_32; 949 return true; 950 } 951 } 952 953 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 954 const MachineInstr &MIa, const MachineInstr &MIb) const { 955 const TargetRegisterInfo *TRI = &getRegisterInfo(); 956 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 957 int64_t OffsetA = 0, OffsetB = 0; 958 unsigned WidthA = 0, WidthB = 0; 959 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 960 961 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 962 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 963 964 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 965 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 966 return false; 967 968 // Retrieve the base, offset from the base and width. Width 969 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 970 // base are identical, and the offset of a lower memory access + 971 // the width doesn't overlap the offset of a higher memory access, 972 // then the memory accesses are different. 973 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 974 // are assumed to have the same scale (vscale). 975 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 976 WidthA, TRI) && 977 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 978 WidthB, TRI)) { 979 if (BaseOpA->isIdenticalTo(*BaseOpB) && 980 OffsetAIsScalable == OffsetBIsScalable) { 981 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 982 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 983 int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 984 if (LowOffset + LowWidth <= HighOffset) 985 return true; 986 } 987 } 988 return false; 989 } 990 991 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 992 const MachineBasicBlock *MBB, 993 const MachineFunction &MF) const { 994 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 995 return true; 996 switch (MI.getOpcode()) { 997 case AArch64::HINT: 998 // CSDB hints are scheduling barriers. 999 if (MI.getOperand(0).getImm() == 0x14) 1000 return true; 1001 break; 1002 case AArch64::DSB: 1003 case AArch64::ISB: 1004 // DSB and ISB also are scheduling barriers. 1005 return true; 1006 default:; 1007 } 1008 return isSEHInstruction(MI); 1009 } 1010 1011 /// analyzeCompare - For a comparison instruction, return the source registers 1012 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1013 /// Return true if the comparison instruction can be analyzed. 1014 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1015 Register &SrcReg2, int &CmpMask, 1016 int &CmpValue) const { 1017 // The first operand can be a frame index where we'd normally expect a 1018 // register. 1019 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1020 if (!MI.getOperand(1).isReg()) 1021 return false; 1022 1023 switch (MI.getOpcode()) { 1024 default: 1025 break; 1026 case AArch64::SUBSWrr: 1027 case AArch64::SUBSWrs: 1028 case AArch64::SUBSWrx: 1029 case AArch64::SUBSXrr: 1030 case AArch64::SUBSXrs: 1031 case AArch64::SUBSXrx: 1032 case AArch64::ADDSWrr: 1033 case AArch64::ADDSWrs: 1034 case AArch64::ADDSWrx: 1035 case AArch64::ADDSXrr: 1036 case AArch64::ADDSXrs: 1037 case AArch64::ADDSXrx: 1038 // Replace SUBSWrr with SUBWrr if NZCV is not used. 
1039 SrcReg = MI.getOperand(1).getReg(); 1040 SrcReg2 = MI.getOperand(2).getReg(); 1041 CmpMask = ~0; 1042 CmpValue = 0; 1043 return true; 1044 case AArch64::SUBSWri: 1045 case AArch64::ADDSWri: 1046 case AArch64::SUBSXri: 1047 case AArch64::ADDSXri: 1048 SrcReg = MI.getOperand(1).getReg(); 1049 SrcReg2 = 0; 1050 CmpMask = ~0; 1051 // FIXME: In order to convert CmpValue to 0 or 1 1052 CmpValue = MI.getOperand(2).getImm() != 0; 1053 return true; 1054 case AArch64::ANDSWri: 1055 case AArch64::ANDSXri: 1056 // ANDS does not use the same encoding scheme as the others xxxS 1057 // instructions. 1058 SrcReg = MI.getOperand(1).getReg(); 1059 SrcReg2 = 0; 1060 CmpMask = ~0; 1061 // FIXME:The return val type of decodeLogicalImmediate is uint64_t, 1062 // while the type of CmpValue is int. When converting uint64_t to int, 1063 // the high 32 bits of uint64_t will be lost. 1064 // In fact it causes a bug in spec2006-483.xalancbmk 1065 // CmpValue is only used to compare with zero in OptimizeCompareInstr 1066 CmpValue = AArch64_AM::decodeLogicalImmediate( 1067 MI.getOperand(2).getImm(), 1068 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; 1069 return true; 1070 } 1071 1072 return false; 1073 } 1074 1075 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1076 MachineBasicBlock *MBB = Instr.getParent(); 1077 assert(MBB && "Can't get MachineBasicBlock here"); 1078 MachineFunction *MF = MBB->getParent(); 1079 assert(MF && "Can't get MachineFunction here"); 1080 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1081 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1082 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1083 1084 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1085 ++OpIdx) { 1086 MachineOperand &MO = Instr.getOperand(OpIdx); 1087 const TargetRegisterClass *OpRegCstraints = 1088 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1089 1090 // If there's no constraint, there's nothing to do. 
1091 if (!OpRegCstraints) 1092 continue; 1093 // If the operand is a frame index, there's nothing to do here. 1094 // A frame index operand will resolve correctly during PEI. 1095 if (MO.isFI()) 1096 continue; 1097 1098 assert(MO.isReg() && 1099 "Operand has register constraints without being a register!"); 1100 1101 Register Reg = MO.getReg(); 1102 if (Register::isPhysicalRegister(Reg)) { 1103 if (!OpRegCstraints->contains(Reg)) 1104 return false; 1105 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1106 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1107 return false; 1108 } 1109 1110 return true; 1111 } 1112 1113 /// Return the opcode that does not set flags when possible - otherwise 1114 /// return the original opcode. The caller is responsible to do the actual 1115 /// substitution and legality checking. 1116 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1117 // Don't convert all compare instructions, because for some the zero register 1118 // encoding becomes the sp register. 1119 bool MIDefinesZeroReg = false; 1120 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1121 MIDefinesZeroReg = true; 1122 1123 switch (MI.getOpcode()) { 1124 default: 1125 return MI.getOpcode(); 1126 case AArch64::ADDSWrr: 1127 return AArch64::ADDWrr; 1128 case AArch64::ADDSWri: 1129 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1130 case AArch64::ADDSWrs: 1131 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1132 case AArch64::ADDSWrx: 1133 return AArch64::ADDWrx; 1134 case AArch64::ADDSXrr: 1135 return AArch64::ADDXrr; 1136 case AArch64::ADDSXri: 1137 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1138 case AArch64::ADDSXrs: 1139 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1140 case AArch64::ADDSXrx: 1141 return AArch64::ADDXrx; 1142 case AArch64::SUBSWrr: 1143 return AArch64::SUBWrr; 1144 case AArch64::SUBSWri: 1145 return MIDefinesZeroReg ? 
AArch64::SUBSWri : AArch64::SUBWri; 1146 case AArch64::SUBSWrs: 1147 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1148 case AArch64::SUBSWrx: 1149 return AArch64::SUBWrx; 1150 case AArch64::SUBSXrr: 1151 return AArch64::SUBXrr; 1152 case AArch64::SUBSXri: 1153 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1154 case AArch64::SUBSXrs: 1155 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1156 case AArch64::SUBSXrx: 1157 return AArch64::SUBXrx; 1158 } 1159 } 1160 1161 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1162 1163 /// True when condition flags are accessed (either by writing or reading) 1164 /// on the instruction trace starting at From and ending at To. 1165 /// 1166 /// Note: If From and To are from different blocks it's assumed CC are accessed 1167 /// on the path. 1168 static bool areCFlagsAccessedBetweenInstrs( 1169 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1170 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1171 // Early exit if To is at the beginning of the BB. 1172 if (To == To->getParent()->begin()) 1173 return true; 1174 1175 // Check whether the instructions are in the same basic block 1176 // If not, assume the condition flags might get modified somewhere. 1177 if (To->getParent() != From->getParent()) 1178 return true; 1179 1180 // From must be above To. 1181 assert(std::find_if(++To.getReverse(), To->getParent()->rend(), 1182 [From](MachineInstr &MI) { 1183 return MI.getIterator() == From; 1184 }) != To->getParent()->rend()); 1185 1186 // We iterate backward starting \p To until we hit \p From. 
1187 for (--To; To != From; --To) { 1188 const MachineInstr &Instr = *To; 1189 1190 if (((AccessToCheck & AK_Write) && 1191 Instr.modifiesRegister(AArch64::NZCV, TRI)) || 1192 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) 1193 return true; 1194 } 1195 return false; 1196 } 1197 1198 /// Try to optimize a compare instruction. A compare instruction is an 1199 /// instruction which produces AArch64::NZCV. It can be truly compare 1200 /// instruction 1201 /// when there are no uses of its destination register. 1202 /// 1203 /// The following steps are tried in order: 1204 /// 1. Convert CmpInstr into an unconditional version. 1205 /// 2. Remove CmpInstr if above there is an instruction producing a needed 1206 /// condition code or an instruction which can be converted into such an 1207 /// instruction. 1208 /// Only comparison with zero is supported. 1209 bool AArch64InstrInfo::optimizeCompareInstr( 1210 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask, 1211 int CmpValue, const MachineRegisterInfo *MRI) const { 1212 assert(CmpInstr.getParent()); 1213 assert(MRI); 1214 1215 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1216 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); 1217 if (DeadNZCVIdx != -1) { 1218 if (CmpInstr.definesRegister(AArch64::WZR) || 1219 CmpInstr.definesRegister(AArch64::XZR)) { 1220 CmpInstr.eraseFromParent(); 1221 return true; 1222 } 1223 unsigned Opc = CmpInstr.getOpcode(); 1224 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1225 if (NewOpc == Opc) 1226 return false; 1227 const MCInstrDesc &MCID = get(NewOpc); 1228 CmpInstr.setDesc(MCID); 1229 CmpInstr.RemoveOperand(DeadNZCVIdx); 1230 bool succeeded = UpdateOperandRegClass(CmpInstr); 1231 (void)succeeded; 1232 assert(succeeded && "Some operands reg class are incompatible!"); 1233 return true; 1234 } 1235 1236 // Continue only if we have a "ri" where immediate is zero. 
1237 // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare 1238 // function. 1239 assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); 1240 if (CmpValue != 0 || SrcReg2 != 0) 1241 return false; 1242 1243 // CmpInstr is a Compare instruction if destination register is not used. 1244 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1245 return false; 1246 1247 return substituteCmpToZero(CmpInstr, SrcReg, MRI); 1248 } 1249 1250 /// Get opcode of S version of Instr. 1251 /// If Instr is S version its opcode is returned. 1252 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1253 /// or we are not interested in it. 1254 static unsigned sForm(MachineInstr &Instr) { 1255 switch (Instr.getOpcode()) { 1256 default: 1257 return AArch64::INSTRUCTION_LIST_END; 1258 1259 case AArch64::ADDSWrr: 1260 case AArch64::ADDSWri: 1261 case AArch64::ADDSXrr: 1262 case AArch64::ADDSXri: 1263 case AArch64::SUBSWrr: 1264 case AArch64::SUBSWri: 1265 case AArch64::SUBSXrr: 1266 case AArch64::SUBSXri: 1267 return Instr.getOpcode(); 1268 1269 case AArch64::ADDWrr: 1270 return AArch64::ADDSWrr; 1271 case AArch64::ADDWri: 1272 return AArch64::ADDSWri; 1273 case AArch64::ADDXrr: 1274 return AArch64::ADDSXrr; 1275 case AArch64::ADDXri: 1276 return AArch64::ADDSXri; 1277 case AArch64::ADCWr: 1278 return AArch64::ADCSWr; 1279 case AArch64::ADCXr: 1280 return AArch64::ADCSXr; 1281 case AArch64::SUBWrr: 1282 return AArch64::SUBSWrr; 1283 case AArch64::SUBWri: 1284 return AArch64::SUBSWri; 1285 case AArch64::SUBXrr: 1286 return AArch64::SUBSXrr; 1287 case AArch64::SUBXri: 1288 return AArch64::SUBSXri; 1289 case AArch64::SBCWr: 1290 return AArch64::SBCSWr; 1291 case AArch64::SBCXr: 1292 return AArch64::SBCSXr; 1293 case AArch64::ANDWri: 1294 return AArch64::ANDSWri; 1295 case AArch64::ANDXri: 1296 return AArch64::ANDSXri; 1297 } 1298 } 1299 1300 /// Check if AArch64::NZCV should be alive in successors of MBB. 
namespace {

/// One boolean per NZCV condition flag; a set bit means the flag is used.
/// A default-constructed value has no flag marked.
struct UsedNZCV {
  bool N = false;
  bool Z = false;
  bool C = false;
  bool V = false;

  UsedNZCV() = default;

  /// Merge \p UsedFlags into this object: a flag ends up marked if it was
  /// marked on either side. Returns a reference to this object.
  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
    N = N || UsedFlags.N;
    Z = Z || UsedFlags.Z;
    C = C || UsedFlags.C;
    V = V || UsedFlags.V;
    return *this;
  }
};

} // end anonymous namespace
// Z clear and C set 1373 case AArch64CC::LS: // Z set or C clear 1374 UsedFlags.Z = true; 1375 LLVM_FALLTHROUGH; 1376 case AArch64CC::HS: // C set 1377 case AArch64CC::LO: // C clear 1378 UsedFlags.C = true; 1379 break; 1380 1381 case AArch64CC::MI: // N set 1382 case AArch64CC::PL: // N clear 1383 UsedFlags.N = true; 1384 break; 1385 1386 case AArch64CC::VS: // V set 1387 case AArch64CC::VC: // V clear 1388 UsedFlags.V = true; 1389 break; 1390 1391 case AArch64CC::GT: // Z clear, N and V the same 1392 case AArch64CC::LE: // Z set, N and V differ 1393 UsedFlags.Z = true; 1394 LLVM_FALLTHROUGH; 1395 case AArch64CC::GE: // N and V the same 1396 case AArch64CC::LT: // N and V differ 1397 UsedFlags.N = true; 1398 UsedFlags.V = true; 1399 break; 1400 } 1401 return UsedFlags; 1402 } 1403 1404 static bool isADDSRegImm(unsigned Opcode) { 1405 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1406 } 1407 1408 static bool isSUBSRegImm(unsigned Opcode) { 1409 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1410 } 1411 1412 /// Check if CmpInstr can be substituted by MI. 1413 /// 1414 /// CmpInstr can be substituted: 1415 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1416 /// - and, MI and CmpInstr are from the same MachineBB 1417 /// - and, condition flags are not alive in successors of the CmpInstr parent 1418 /// - and, if MI opcode is the S form there must be no defs of flags between 1419 /// MI and CmpInstr 1420 /// or if MI opcode is not the S form there must be neither defs of flags 1421 /// nor uses of flags between MI and CmpInstr. 
1422 /// - and C/V flags are not used after CmpInstr 1423 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, 1424 const TargetRegisterInfo *TRI) { 1425 assert(MI); 1426 assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); 1427 assert(CmpInstr); 1428 1429 const unsigned CmpOpcode = CmpInstr->getOpcode(); 1430 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1431 return false; 1432 1433 if (MI->getParent() != CmpInstr->getParent()) 1434 return false; 1435 1436 if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) 1437 return false; 1438 1439 AccessKind AccessToCheck = AK_Write; 1440 if (sForm(*MI) != MI->getOpcode()) 1441 AccessToCheck = AK_All; 1442 if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) 1443 return false; 1444 1445 UsedNZCV NZCVUsedAfterCmp; 1446 for (auto I = std::next(CmpInstr->getIterator()), 1447 E = CmpInstr->getParent()->instr_end(); 1448 I != E; ++I) { 1449 const MachineInstr &Instr = *I; 1450 if (Instr.readsRegister(AArch64::NZCV, TRI)) { 1451 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1452 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1453 return false; 1454 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1455 } 1456 1457 if (Instr.modifiesRegister(AArch64::NZCV, TRI)) 1458 break; 1459 } 1460 1461 return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; 1462 } 1463 1464 /// Substitute an instruction comparing to zero with another instruction 1465 /// which produces needed condition flags. 1466 /// 1467 /// Return true on success. 1468 bool AArch64InstrInfo::substituteCmpToZero( 1469 MachineInstr &CmpInstr, unsigned SrcReg, 1470 const MachineRegisterInfo *MRI) const { 1471 assert(MRI); 1472 // Get the unique definition of SrcReg. 
1473 MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); 1474 if (!MI) 1475 return false; 1476 1477 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1478 1479 unsigned NewOpc = sForm(*MI); 1480 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1481 return false; 1482 1483 if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) 1484 return false; 1485 1486 // Update the instruction to set NZCV. 1487 MI->setDesc(get(NewOpc)); 1488 CmpInstr.eraseFromParent(); 1489 bool succeeded = UpdateOperandRegClass(*MI); 1490 (void)succeeded; 1491 assert(succeeded && "Some operands reg class are incompatible!"); 1492 MI->addRegisterDefined(AArch64::NZCV, TRI); 1493 return true; 1494 } 1495 1496 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1497 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1498 MI.getOpcode() != AArch64::CATCHRET) 1499 return false; 1500 1501 MachineBasicBlock &MBB = *MI.getParent(); 1502 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1503 auto TRI = Subtarget.getRegisterInfo(); 1504 DebugLoc DL = MI.getDebugLoc(); 1505 1506 if (MI.getOpcode() == AArch64::CATCHRET) { 1507 // Skip to the first instruction before the epilog. 
1508 const TargetInstrInfo *TII = 1509 MBB.getParent()->getSubtarget().getInstrInfo(); 1510 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); 1511 auto MBBI = MachineBasicBlock::iterator(MI); 1512 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); 1513 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && 1514 FirstEpilogSEH != MBB.begin()) 1515 FirstEpilogSEH = std::prev(FirstEpilogSEH); 1516 if (FirstEpilogSEH != MBB.begin()) 1517 FirstEpilogSEH = std::next(FirstEpilogSEH); 1518 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) 1519 .addReg(AArch64::X0, RegState::Define) 1520 .addMBB(TargetMBB); 1521 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) 1522 .addReg(AArch64::X0, RegState::Define) 1523 .addReg(AArch64::X0) 1524 .addMBB(TargetMBB) 1525 .addImm(0); 1526 return true; 1527 } 1528 1529 Register Reg = MI.getOperand(0).getReg(); 1530 const GlobalValue *GV = 1531 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1532 const TargetMachine &TM = MBB.getParent()->getTarget(); 1533 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1534 const unsigned char MO_NC = AArch64II::MO_NC; 1535 1536 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1537 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1538 .addGlobalAddress(GV, 0, OpFlags); 1539 if (Subtarget.isTargetILP32()) { 1540 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1541 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1542 .addDef(Reg32, RegState::Dead) 1543 .addUse(Reg, RegState::Kill) 1544 .addImm(0) 1545 .addMemOperand(*MI.memoperands_begin()) 1546 .addDef(Reg, RegState::Implicit); 1547 } else { 1548 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1549 .addReg(Reg, RegState::Kill) 1550 .addImm(0) 1551 .addMemOperand(*MI.memoperands_begin()); 1552 } 1553 } else if (TM.getCodeModel() == CodeModel::Large) { 1554 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1555 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1556 
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 1557 .addImm(0); 1558 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1559 .addReg(Reg, RegState::Kill) 1560 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 1561 .addImm(16); 1562 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1563 .addReg(Reg, RegState::Kill) 1564 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 1565 .addImm(32); 1566 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1567 .addReg(Reg, RegState::Kill) 1568 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 1569 .addImm(48); 1570 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1571 .addReg(Reg, RegState::Kill) 1572 .addImm(0) 1573 .addMemOperand(*MI.memoperands_begin()); 1574 } else if (TM.getCodeModel() == CodeModel::Tiny) { 1575 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 1576 .addGlobalAddress(GV, 0, OpFlags); 1577 } else { 1578 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 1579 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 1580 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 1581 if (Subtarget.isTargetILP32()) { 1582 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1583 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1584 .addDef(Reg32, RegState::Dead) 1585 .addUse(Reg, RegState::Kill) 1586 .addGlobalAddress(GV, 0, LoFlags) 1587 .addMemOperand(*MI.memoperands_begin()) 1588 .addDef(Reg, RegState::Implicit); 1589 } else { 1590 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1591 .addReg(Reg, RegState::Kill) 1592 .addGlobalAddress(GV, 0, LoFlags) 1593 .addMemOperand(*MI.memoperands_begin()); 1594 } 1595 } 1596 1597 MBB.erase(MI); 1598 1599 return true; 1600 } 1601 1602 // Return true if this instruction simply sets its single destination register 1603 // to zero. This is equivalent to a register rename of the zero-register. 
1604 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) { 1605 switch (MI.getOpcode()) { 1606 default: 1607 break; 1608 case AArch64::MOVZWi: 1609 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) 1610 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { 1611 assert(MI.getDesc().getNumOperands() == 3 && 1612 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); 1613 return true; 1614 } 1615 break; 1616 case AArch64::ANDWri: // and Rd, Rzr, #imm 1617 return MI.getOperand(1).getReg() == AArch64::WZR; 1618 case AArch64::ANDXri: 1619 return MI.getOperand(1).getReg() == AArch64::XZR; 1620 case TargetOpcode::COPY: 1621 return MI.getOperand(1).getReg() == AArch64::WZR; 1622 } 1623 return false; 1624 } 1625 1626 // Return true if this instruction simply renames a general register without 1627 // modifying bits. 1628 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { 1629 switch (MI.getOpcode()) { 1630 default: 1631 break; 1632 case TargetOpcode::COPY: { 1633 // GPR32 copies will by lowered to ORRXrs 1634 Register DstReg = MI.getOperand(0).getReg(); 1635 return (AArch64::GPR32RegClass.contains(DstReg) || 1636 AArch64::GPR64RegClass.contains(DstReg)); 1637 } 1638 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) 1639 if (MI.getOperand(1).getReg() == AArch64::XZR) { 1640 assert(MI.getDesc().getNumOperands() == 4 && 1641 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); 1642 return true; 1643 } 1644 break; 1645 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) 1646 if (MI.getOperand(2).getImm() == 0) { 1647 assert(MI.getDesc().getNumOperands() == 4 && 1648 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); 1649 return true; 1650 } 1651 break; 1652 } 1653 return false; 1654 } 1655 1656 // Return true if this instruction simply renames a general register without 1657 // modifying bits. 
1658 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 1659 switch (MI.getOpcode()) { 1660 default: 1661 break; 1662 case TargetOpcode::COPY: { 1663 // FPR64 copies will by lowered to ORR.16b 1664 Register DstReg = MI.getOperand(0).getReg(); 1665 return (AArch64::FPR64RegClass.contains(DstReg) || 1666 AArch64::FPR128RegClass.contains(DstReg)); 1667 } 1668 case AArch64::ORRv16i8: 1669 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 1670 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 1671 "invalid ORRv16i8 operands"); 1672 return true; 1673 } 1674 break; 1675 } 1676 return false; 1677 } 1678 1679 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 1680 int &FrameIndex) const { 1681 switch (MI.getOpcode()) { 1682 default: 1683 break; 1684 case AArch64::LDRWui: 1685 case AArch64::LDRXui: 1686 case AArch64::LDRBui: 1687 case AArch64::LDRHui: 1688 case AArch64::LDRSui: 1689 case AArch64::LDRDui: 1690 case AArch64::LDRQui: 1691 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 1692 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 1693 FrameIndex = MI.getOperand(1).getIndex(); 1694 return MI.getOperand(0).getReg(); 1695 } 1696 break; 1697 } 1698 1699 return 0; 1700 } 1701 1702 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 1703 int &FrameIndex) const { 1704 switch (MI.getOpcode()) { 1705 default: 1706 break; 1707 case AArch64::STRWui: 1708 case AArch64::STRXui: 1709 case AArch64::STRBui: 1710 case AArch64::STRHui: 1711 case AArch64::STRSui: 1712 case AArch64::STRDui: 1713 case AArch64::STRQui: 1714 case AArch64::LDR_PXI: 1715 case AArch64::STR_PXI: 1716 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 1717 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 1718 FrameIndex = MI.getOperand(1).getIndex(); 1719 return MI.getOperand(0).getReg(); 1720 } 1721 break; 1722 } 1723 return 0; 1724 } 1725 1726 /// Check all 
MachineMemOperands for a hint to suppress pairing. 1727 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 1728 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 1729 return MMO->getFlags() & MOSuppressPair; 1730 }); 1731 } 1732 1733 /// Set a flag on the first MachineMemOperand to suppress pairing. 1734 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 1735 if (MI.memoperands_empty()) 1736 return; 1737 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 1738 } 1739 1740 /// Check all MachineMemOperands for a hint that the load/store is strided. 1741 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 1742 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 1743 return MMO->getFlags() & MOStridedAccess; 1744 }); 1745 } 1746 1747 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) { 1748 switch (Opc) { 1749 default: 1750 return false; 1751 case AArch64::STURSi: 1752 case AArch64::STURDi: 1753 case AArch64::STURQi: 1754 case AArch64::STURBBi: 1755 case AArch64::STURHHi: 1756 case AArch64::STURWi: 1757 case AArch64::STURXi: 1758 case AArch64::LDURSi: 1759 case AArch64::LDURDi: 1760 case AArch64::LDURQi: 1761 case AArch64::LDURWi: 1762 case AArch64::LDURXi: 1763 case AArch64::LDURSWi: 1764 case AArch64::LDURHHi: 1765 case AArch64::LDURBBi: 1766 case AArch64::LDURSBWi: 1767 case AArch64::LDURSHWi: 1768 return true; 1769 } 1770 } 1771 1772 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 1773 switch (Opc) { 1774 default: return {}; 1775 case AArch64::PRFMui: return AArch64::PRFUMi; 1776 case AArch64::LDRXui: return AArch64::LDURXi; 1777 case AArch64::LDRWui: return AArch64::LDURWi; 1778 case AArch64::LDRBui: return AArch64::LDURBi; 1779 case AArch64::LDRHui: return AArch64::LDURHi; 1780 case AArch64::LDRSui: return AArch64::LDURSi; 1781 case AArch64::LDRDui: return AArch64::LDURDi; 1782 case AArch64::LDRQui: return AArch64::LDURQi; 1783 case AArch64::LDRBBui: return 
AArch64::LDURBBi; 1784 case AArch64::LDRHHui: return AArch64::LDURHHi; 1785 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 1786 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 1787 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 1788 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 1789 case AArch64::LDRSWui: return AArch64::LDURSWi; 1790 case AArch64::STRXui: return AArch64::STURXi; 1791 case AArch64::STRWui: return AArch64::STURWi; 1792 case AArch64::STRBui: return AArch64::STURBi; 1793 case AArch64::STRHui: return AArch64::STURHi; 1794 case AArch64::STRSui: return AArch64::STURSi; 1795 case AArch64::STRDui: return AArch64::STURDi; 1796 case AArch64::STRQui: return AArch64::STURQi; 1797 case AArch64::STRBBui: return AArch64::STURBBi; 1798 case AArch64::STRHHui: return AArch64::STURHHi; 1799 } 1800 } 1801 1802 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 1803 switch (Opc) { 1804 default: 1805 return 2; 1806 case AArch64::LDPXi: 1807 case AArch64::LDPDi: 1808 case AArch64::STPXi: 1809 case AArch64::STPDi: 1810 case AArch64::LDNPXi: 1811 case AArch64::LDNPDi: 1812 case AArch64::STNPXi: 1813 case AArch64::STNPDi: 1814 case AArch64::LDPQi: 1815 case AArch64::STPQi: 1816 case AArch64::LDNPQi: 1817 case AArch64::STNPQi: 1818 case AArch64::LDPWi: 1819 case AArch64::LDPSi: 1820 case AArch64::STPWi: 1821 case AArch64::STPSi: 1822 case AArch64::LDNPWi: 1823 case AArch64::LDNPSi: 1824 case AArch64::STNPWi: 1825 case AArch64::STNPSi: 1826 case AArch64::LDG: 1827 case AArch64::STGPi: 1828 case AArch64::LD1B_IMM: 1829 case AArch64::LD1H_IMM: 1830 case AArch64::LD1W_IMM: 1831 case AArch64::LD1D_IMM: 1832 case AArch64::ST1B_IMM: 1833 case AArch64::ST1H_IMM: 1834 case AArch64::ST1W_IMM: 1835 case AArch64::ST1D_IMM: 1836 return 3; 1837 case AArch64::ADDG: 1838 case AArch64::STGOffset: 1839 case AArch64::LDR_PXI: 1840 case AArch64::STR_PXI: 1841 return 2; 1842 } 1843 } 1844 1845 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 1846 switch 
(MI.getOpcode()) { 1847 default: 1848 return false; 1849 // Scaled instructions. 1850 case AArch64::STRSui: 1851 case AArch64::STRDui: 1852 case AArch64::STRQui: 1853 case AArch64::STRXui: 1854 case AArch64::STRWui: 1855 case AArch64::LDRSui: 1856 case AArch64::LDRDui: 1857 case AArch64::LDRQui: 1858 case AArch64::LDRXui: 1859 case AArch64::LDRWui: 1860 case AArch64::LDRSWui: 1861 // Unscaled instructions. 1862 case AArch64::STURSi: 1863 case AArch64::STURDi: 1864 case AArch64::STURQi: 1865 case AArch64::STURWi: 1866 case AArch64::STURXi: 1867 case AArch64::LDURSi: 1868 case AArch64::LDURDi: 1869 case AArch64::LDURQi: 1870 case AArch64::LDURWi: 1871 case AArch64::LDURXi: 1872 case AArch64::LDURSWi: 1873 return true; 1874 } 1875 } 1876 1877 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 1878 bool &Is64Bit) { 1879 switch (Opc) { 1880 default: 1881 llvm_unreachable("Opcode has no flag setting equivalent!"); 1882 // 32-bit cases: 1883 case AArch64::ADDWri: 1884 Is64Bit = false; 1885 return AArch64::ADDSWri; 1886 case AArch64::ADDWrr: 1887 Is64Bit = false; 1888 return AArch64::ADDSWrr; 1889 case AArch64::ADDWrs: 1890 Is64Bit = false; 1891 return AArch64::ADDSWrs; 1892 case AArch64::ADDWrx: 1893 Is64Bit = false; 1894 return AArch64::ADDSWrx; 1895 case AArch64::ANDWri: 1896 Is64Bit = false; 1897 return AArch64::ANDSWri; 1898 case AArch64::ANDWrr: 1899 Is64Bit = false; 1900 return AArch64::ANDSWrr; 1901 case AArch64::ANDWrs: 1902 Is64Bit = false; 1903 return AArch64::ANDSWrs; 1904 case AArch64::BICWrr: 1905 Is64Bit = false; 1906 return AArch64::BICSWrr; 1907 case AArch64::BICWrs: 1908 Is64Bit = false; 1909 return AArch64::BICSWrs; 1910 case AArch64::SUBWri: 1911 Is64Bit = false; 1912 return AArch64::SUBSWri; 1913 case AArch64::SUBWrr: 1914 Is64Bit = false; 1915 return AArch64::SUBSWrr; 1916 case AArch64::SUBWrs: 1917 Is64Bit = false; 1918 return AArch64::SUBSWrs; 1919 case AArch64::SUBWrx: 1920 Is64Bit = false; 1921 return AArch64::SUBSWrx; 1922 // 64-bit 
cases: 1923 case AArch64::ADDXri: 1924 Is64Bit = true; 1925 return AArch64::ADDSXri; 1926 case AArch64::ADDXrr: 1927 Is64Bit = true; 1928 return AArch64::ADDSXrr; 1929 case AArch64::ADDXrs: 1930 Is64Bit = true; 1931 return AArch64::ADDSXrs; 1932 case AArch64::ADDXrx: 1933 Is64Bit = true; 1934 return AArch64::ADDSXrx; 1935 case AArch64::ANDXri: 1936 Is64Bit = true; 1937 return AArch64::ANDSXri; 1938 case AArch64::ANDXrr: 1939 Is64Bit = true; 1940 return AArch64::ANDSXrr; 1941 case AArch64::ANDXrs: 1942 Is64Bit = true; 1943 return AArch64::ANDSXrs; 1944 case AArch64::BICXrr: 1945 Is64Bit = true; 1946 return AArch64::BICSXrr; 1947 case AArch64::BICXrs: 1948 Is64Bit = true; 1949 return AArch64::BICSXrs; 1950 case AArch64::SUBXri: 1951 Is64Bit = true; 1952 return AArch64::SUBSXri; 1953 case AArch64::SUBXrr: 1954 Is64Bit = true; 1955 return AArch64::SUBSXrr; 1956 case AArch64::SUBXrs: 1957 Is64Bit = true; 1958 return AArch64::SUBSXrs; 1959 case AArch64::SUBXrx: 1960 Is64Bit = true; 1961 return AArch64::SUBSXrx; 1962 } 1963 } 1964 1965 // Is this a candidate for ld/st merging or pairing? For example, we don't 1966 // touch volatiles or load/stores that have a hint to avoid pair formation. 1967 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 1968 // If this is a volatile load/store, don't mess with it. 1969 if (MI.hasOrderedMemoryRef()) 1970 return false; 1971 1972 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 1973 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) && 1974 "Expected a reg or frame index operand."); 1975 if (!MI.getOperand(2).isImm()) 1976 return false; 1977 1978 // Can't merge/pair if the instruction modifies the base register. 1979 // e.g., ldr x0, [x0] 1980 // This case will never occur with an FI base. 
1981 if (MI.getOperand(1).isReg()) { 1982 Register BaseReg = MI.getOperand(1).getReg(); 1983 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1984 if (MI.modifiesRegister(BaseReg, TRI)) 1985 return false; 1986 } 1987 1988 // Check if this load/store has a hint to avoid pair formation. 1989 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 1990 if (isLdStPairSuppressed(MI)) 1991 return false; 1992 1993 // Do not pair any callee-save store/reload instructions in the 1994 // prologue/epilogue if the CFI information encoded the operations as separate 1995 // instructions, as that will cause the size of the actual prologue to mismatch 1996 // with the prologue size recorded in the Windows CFI. 1997 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 1998 bool NeedsWinCFI = MAI->usesWindowsCFI() && 1999 MI.getMF()->getFunction().needsUnwindTableEntry(); 2000 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2001 MI.getFlag(MachineInstr::FrameDestroy))) 2002 return false; 2003 2004 // On some CPUs quad load/store pairs are slower than two single load/stores. 
2005 if (Subtarget.isPaired128Slow()) { 2006 switch (MI.getOpcode()) { 2007 default: 2008 break; 2009 case AArch64::LDURQi: 2010 case AArch64::STURQi: 2011 case AArch64::LDRQui: 2012 case AArch64::STRQui: 2013 return false; 2014 } 2015 } 2016 2017 return true; 2018 } 2019 2020 bool AArch64InstrInfo::getMemOperandsWithOffset( 2021 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2022 int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) 2023 const { 2024 if (!LdSt.mayLoadOrStore()) 2025 return false; 2026 2027 const MachineOperand *BaseOp; 2028 unsigned Width; 2029 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2030 Width, TRI)) 2031 return false; 2032 BaseOps.push_back(BaseOp); 2033 return true; 2034 } 2035 2036 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2037 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2038 bool &OffsetIsScalable, unsigned &Width, 2039 const TargetRegisterInfo *TRI) const { 2040 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2041 // Handle only loads/stores with base register followed by immediate offset. 2042 if (LdSt.getNumExplicitOperands() == 3) { 2043 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 2044 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2045 !LdSt.getOperand(2).isImm()) 2046 return false; 2047 } else if (LdSt.getNumExplicitOperands() == 4) { 2048 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2049 if (!LdSt.getOperand(1).isReg() || 2050 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2051 !LdSt.getOperand(3).isImm()) 2052 return false; 2053 } else 2054 return false; 2055 2056 // Get the scaling factor for the instruction and set the width for the 2057 // instruction. 2058 TypeSize Scale(0U, false); 2059 int64_t Dummy1, Dummy2; 2060 2061 // If this returns false, then it's an instruction we don't want to handle. 
2062 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2063 return false; 2064 2065 // Compute the offset. Offset is calculated as the immediate operand 2066 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2067 // set to 1. 2068 if (LdSt.getNumExplicitOperands() == 3) { 2069 BaseOp = &LdSt.getOperand(1); 2070 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); 2071 } else { 2072 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2073 BaseOp = &LdSt.getOperand(2); 2074 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); 2075 } 2076 OffsetIsScalable = Scale.isScalable(); 2077 2078 if (!BaseOp->isReg() && !BaseOp->isFI()) 2079 return false; 2080 2081 return true; 2082 } 2083 2084 MachineOperand & 2085 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2086 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2087 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2088 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2089 return OfsOp; 2090 } 2091 2092 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2093 unsigned &Width, int64_t &MinOffset, 2094 int64_t &MaxOffset) { 2095 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; 2096 switch (Opcode) { 2097 // Not a memory operation or something we want to handle. 
2098 default: 2099 Scale = TypeSize::Fixed(0); 2100 Width = 0; 2101 MinOffset = MaxOffset = 0; 2102 return false; 2103 case AArch64::STRWpost: 2104 case AArch64::LDRWpost: 2105 Width = 32; 2106 Scale = TypeSize::Fixed(4); 2107 MinOffset = -256; 2108 MaxOffset = 255; 2109 break; 2110 case AArch64::LDURQi: 2111 case AArch64::STURQi: 2112 Width = 16; 2113 Scale = TypeSize::Fixed(1); 2114 MinOffset = -256; 2115 MaxOffset = 255; 2116 break; 2117 case AArch64::PRFUMi: 2118 case AArch64::LDURXi: 2119 case AArch64::LDURDi: 2120 case AArch64::STURXi: 2121 case AArch64::STURDi: 2122 Width = 8; 2123 Scale = TypeSize::Fixed(1); 2124 MinOffset = -256; 2125 MaxOffset = 255; 2126 break; 2127 case AArch64::LDURWi: 2128 case AArch64::LDURSi: 2129 case AArch64::LDURSWi: 2130 case AArch64::STURWi: 2131 case AArch64::STURSi: 2132 Width = 4; 2133 Scale = TypeSize::Fixed(1); 2134 MinOffset = -256; 2135 MaxOffset = 255; 2136 break; 2137 case AArch64::LDURHi: 2138 case AArch64::LDURHHi: 2139 case AArch64::LDURSHXi: 2140 case AArch64::LDURSHWi: 2141 case AArch64::STURHi: 2142 case AArch64::STURHHi: 2143 Width = 2; 2144 Scale = TypeSize::Fixed(1); 2145 MinOffset = -256; 2146 MaxOffset = 255; 2147 break; 2148 case AArch64::LDURBi: 2149 case AArch64::LDURBBi: 2150 case AArch64::LDURSBXi: 2151 case AArch64::LDURSBWi: 2152 case AArch64::STURBi: 2153 case AArch64::STURBBi: 2154 Width = 1; 2155 Scale = TypeSize::Fixed(1); 2156 MinOffset = -256; 2157 MaxOffset = 255; 2158 break; 2159 case AArch64::LDPQi: 2160 case AArch64::LDNPQi: 2161 case AArch64::STPQi: 2162 case AArch64::STNPQi: 2163 Scale = TypeSize::Fixed(16); 2164 Width = 32; 2165 MinOffset = -64; 2166 MaxOffset = 63; 2167 break; 2168 case AArch64::LDRQui: 2169 case AArch64::STRQui: 2170 Scale = TypeSize::Fixed(16); 2171 Width = 16; 2172 MinOffset = 0; 2173 MaxOffset = 4095; 2174 break; 2175 case AArch64::LDPXi: 2176 case AArch64::LDPDi: 2177 case AArch64::LDNPXi: 2178 case AArch64::LDNPDi: 2179 case AArch64::STPXi: 2180 case 
AArch64::STPDi: 2181 case AArch64::STNPXi: 2182 case AArch64::STNPDi: 2183 Scale = TypeSize::Fixed(8); 2184 Width = 16; 2185 MinOffset = -64; 2186 MaxOffset = 63; 2187 break; 2188 case AArch64::PRFMui: 2189 case AArch64::LDRXui: 2190 case AArch64::LDRDui: 2191 case AArch64::STRXui: 2192 case AArch64::STRDui: 2193 Scale = TypeSize::Fixed(8); 2194 Width = 8; 2195 MinOffset = 0; 2196 MaxOffset = 4095; 2197 break; 2198 case AArch64::LDPWi: 2199 case AArch64::LDPSi: 2200 case AArch64::LDNPWi: 2201 case AArch64::LDNPSi: 2202 case AArch64::STPWi: 2203 case AArch64::STPSi: 2204 case AArch64::STNPWi: 2205 case AArch64::STNPSi: 2206 Scale = TypeSize::Fixed(4); 2207 Width = 8; 2208 MinOffset = -64; 2209 MaxOffset = 63; 2210 break; 2211 case AArch64::LDRWui: 2212 case AArch64::LDRSui: 2213 case AArch64::LDRSWui: 2214 case AArch64::STRWui: 2215 case AArch64::STRSui: 2216 Scale = TypeSize::Fixed(4); 2217 Width = 4; 2218 MinOffset = 0; 2219 MaxOffset = 4095; 2220 break; 2221 case AArch64::LDRHui: 2222 case AArch64::LDRHHui: 2223 case AArch64::LDRSHWui: 2224 case AArch64::LDRSHXui: 2225 case AArch64::STRHui: 2226 case AArch64::STRHHui: 2227 Scale = TypeSize::Fixed(2); 2228 Width = 2; 2229 MinOffset = 0; 2230 MaxOffset = 4095; 2231 break; 2232 case AArch64::LDRBui: 2233 case AArch64::LDRBBui: 2234 case AArch64::LDRSBWui: 2235 case AArch64::LDRSBXui: 2236 case AArch64::STRBui: 2237 case AArch64::STRBBui: 2238 Scale = TypeSize::Fixed(1); 2239 Width = 1; 2240 MinOffset = 0; 2241 MaxOffset = 4095; 2242 break; 2243 case AArch64::ADDG: 2244 Scale = TypeSize::Fixed(16); 2245 Width = 0; 2246 MinOffset = 0; 2247 MaxOffset = 63; 2248 break; 2249 case AArch64::TAGPstack: 2250 Scale = TypeSize::Fixed(16); 2251 Width = 0; 2252 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2253 // of 63 (not 64!). 
2254 MinOffset = -63; 2255 MaxOffset = 63; 2256 break; 2257 case AArch64::LDG: 2258 case AArch64::STGOffset: 2259 case AArch64::STZGOffset: 2260 Scale = TypeSize::Fixed(16); 2261 Width = 16; 2262 MinOffset = -256; 2263 MaxOffset = 255; 2264 break; 2265 case AArch64::LDR_PXI: 2266 case AArch64::STR_PXI: 2267 Scale = TypeSize::Scalable(2); 2268 Width = SVEMaxBytesPerVector / 8; 2269 MinOffset = -256; 2270 MaxOffset = 255; 2271 break; 2272 case AArch64::LDR_ZXI: 2273 case AArch64::STR_ZXI: 2274 Scale = TypeSize::Scalable(16); 2275 Width = SVEMaxBytesPerVector; 2276 MinOffset = -256; 2277 MaxOffset = 255; 2278 break; 2279 case AArch64::LD1B_IMM: 2280 case AArch64::LD1H_IMM: 2281 case AArch64::LD1W_IMM: 2282 case AArch64::LD1D_IMM: 2283 case AArch64::ST1B_IMM: 2284 case AArch64::ST1H_IMM: 2285 case AArch64::ST1W_IMM: 2286 case AArch64::ST1D_IMM: 2287 // A full vectors worth of data 2288 // Width = mbytes * elements 2289 Scale = TypeSize::Scalable(16); 2290 Width = SVEMaxBytesPerVector; 2291 MinOffset = -8; 2292 MaxOffset = 7; 2293 break; 2294 case AArch64::ST2GOffset: 2295 case AArch64::STZ2GOffset: 2296 Scale = TypeSize::Fixed(16); 2297 Width = 32; 2298 MinOffset = -256; 2299 MaxOffset = 255; 2300 break; 2301 case AArch64::STGPi: 2302 Scale = TypeSize::Fixed(16); 2303 Width = 16; 2304 MinOffset = -64; 2305 MaxOffset = 63; 2306 break; 2307 } 2308 2309 return true; 2310 } 2311 2312 // Scaling factor for unscaled load or store. 
2313 int AArch64InstrInfo::getMemScale(unsigned Opc) { 2314 switch (Opc) { 2315 default: 2316 llvm_unreachable("Opcode has unknown scale!"); 2317 case AArch64::LDRBBui: 2318 case AArch64::LDURBBi: 2319 case AArch64::LDRSBWui: 2320 case AArch64::LDURSBWi: 2321 case AArch64::STRBBui: 2322 case AArch64::STURBBi: 2323 return 1; 2324 case AArch64::LDRHHui: 2325 case AArch64::LDURHHi: 2326 case AArch64::LDRSHWui: 2327 case AArch64::LDURSHWi: 2328 case AArch64::STRHHui: 2329 case AArch64::STURHHi: 2330 return 2; 2331 case AArch64::LDRSui: 2332 case AArch64::LDURSi: 2333 case AArch64::LDRSWui: 2334 case AArch64::LDURSWi: 2335 case AArch64::LDRWui: 2336 case AArch64::LDURWi: 2337 case AArch64::STRSui: 2338 case AArch64::STURSi: 2339 case AArch64::STRWui: 2340 case AArch64::STURWi: 2341 case AArch64::LDPSi: 2342 case AArch64::LDPSWi: 2343 case AArch64::LDPWi: 2344 case AArch64::STPSi: 2345 case AArch64::STPWi: 2346 return 4; 2347 case AArch64::LDRDui: 2348 case AArch64::LDURDi: 2349 case AArch64::LDRXui: 2350 case AArch64::LDURXi: 2351 case AArch64::STRDui: 2352 case AArch64::STURDi: 2353 case AArch64::STRXui: 2354 case AArch64::STURXi: 2355 case AArch64::LDPDi: 2356 case AArch64::LDPXi: 2357 case AArch64::STPDi: 2358 case AArch64::STPXi: 2359 return 8; 2360 case AArch64::LDRQui: 2361 case AArch64::LDURQi: 2362 case AArch64::STRQui: 2363 case AArch64::STURQi: 2364 case AArch64::LDPQi: 2365 case AArch64::STPQi: 2366 case AArch64::STGOffset: 2367 case AArch64::STZGOffset: 2368 case AArch64::ST2GOffset: 2369 case AArch64::STZ2GOffset: 2370 case AArch64::STGPi: 2371 return 16; 2372 } 2373 } 2374 2375 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 2376 // scaled. 2377 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 2378 int Scale = AArch64InstrInfo::getMemScale(Opc); 2379 2380 // If the byte-offset isn't a multiple of the stride, we can't scale this 2381 // offset. 
2382 if (Offset % Scale != 0) 2383 return false; 2384 2385 // Convert the byte-offset used by unscaled into an "element" offset used 2386 // by the scaled pair load/store instructions. 2387 Offset /= Scale; 2388 return true; 2389 } 2390 2391 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 2392 if (FirstOpc == SecondOpc) 2393 return true; 2394 // We can also pair sign-ext and zero-ext instructions. 2395 switch (FirstOpc) { 2396 default: 2397 return false; 2398 case AArch64::LDRWui: 2399 case AArch64::LDURWi: 2400 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 2401 case AArch64::LDRSWui: 2402 case AArch64::LDURSWi: 2403 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 2404 } 2405 // These instructions can't be paired based on their opcodes. 2406 return false; 2407 } 2408 2409 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 2410 int64_t Offset1, unsigned Opcode1, int FI2, 2411 int64_t Offset2, unsigned Opcode2) { 2412 // Accesses through fixed stack object frame indices may access a different 2413 // fixed stack slot. Check that the object offsets + offsets match. 2414 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 2415 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 2416 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 2417 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 2418 // Convert to scaled object offsets. 2419 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 2420 if (ObjectOffset1 % Scale1 != 0) 2421 return false; 2422 ObjectOffset1 /= Scale1; 2423 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 2424 if (ObjectOffset2 % Scale2 != 0) 2425 return false; 2426 ObjectOffset2 /= Scale2; 2427 ObjectOffset1 += Offset1; 2428 ObjectOffset2 += Offset2; 2429 return ObjectOffset1 + 1 == ObjectOffset2; 2430 } 2431 2432 return FI1 == FI2; 2433 } 2434 2435 /// Detect opportunities for ldp/stp formation. 
2436 /// 2437 /// Only called for LdSt for which getMemOperandWithOffset returns true. 2438 bool AArch64InstrInfo::shouldClusterMemOps( 2439 ArrayRef<const MachineOperand *> BaseOps1, 2440 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads) const { 2441 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 2442 const MachineOperand &BaseOp1 = *BaseOps1.front(); 2443 const MachineOperand &BaseOp2 = *BaseOps2.front(); 2444 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 2445 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 2446 if (BaseOp1.getType() != BaseOp2.getType()) 2447 return false; 2448 2449 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 2450 "Only base registers and frame indices are supported."); 2451 2452 // Check for both base regs and base FI. 2453 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 2454 return false; 2455 2456 // Only cluster up to a single pair. 2457 if (NumLoads > 2) 2458 return false; 2459 2460 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 2461 return false; 2462 2463 // Can we pair these instructions based on their opcodes? 2464 unsigned FirstOpc = FirstLdSt.getOpcode(); 2465 unsigned SecondOpc = SecondLdSt.getOpcode(); 2466 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 2467 return false; 2468 2469 // Can't merge volatiles or load/stores that have a hint to avoid pair 2470 // formation, for example. 2471 if (!isCandidateToMergeOrPair(FirstLdSt) || 2472 !isCandidateToMergeOrPair(SecondLdSt)) 2473 return false; 2474 2475 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 2476 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 2477 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 2478 return false; 2479 2480 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 2481 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 2482 return false; 2483 2484 // Pairwise instructions have a 7-bit signed offset field. 
2485 if (Offset1 > 63 || Offset1 < -64) 2486 return false; 2487 2488 // The caller should already have ordered First/SecondLdSt by offset. 2489 // Note: except for non-equal frame index bases 2490 if (BaseOp1.isFI()) { 2491 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 2492 "Caller should have ordered offsets."); 2493 2494 const MachineFrameInfo &MFI = 2495 FirstLdSt.getParent()->getParent()->getFrameInfo(); 2496 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 2497 BaseOp2.getIndex(), Offset2, SecondOpc); 2498 } 2499 2500 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 2501 2502 return Offset1 + 1 == Offset2; 2503 } 2504 2505 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 2506 unsigned Reg, unsigned SubIdx, 2507 unsigned State, 2508 const TargetRegisterInfo *TRI) { 2509 if (!SubIdx) 2510 return MIB.addReg(Reg, State); 2511 2512 if (Register::isPhysicalRegister(Reg)) 2513 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 2514 return MIB.addReg(Reg, State, SubIdx); 2515 } 2516 2517 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 2518 unsigned NumRegs) { 2519 // We really want the positive remainder mod 32 here, that happens to be 2520 // easily obtainable with a mask. 
2521 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 2522 } 2523 2524 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 2525 MachineBasicBlock::iterator I, 2526 const DebugLoc &DL, MCRegister DestReg, 2527 MCRegister SrcReg, bool KillSrc, 2528 unsigned Opcode, 2529 ArrayRef<unsigned> Indices) const { 2530 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 2531 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2532 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2533 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2534 unsigned NumRegs = Indices.size(); 2535 2536 int SubReg = 0, End = NumRegs, Incr = 1; 2537 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 2538 SubReg = NumRegs - 1; 2539 End = -1; 2540 Incr = -1; 2541 } 2542 2543 for (; SubReg != End; SubReg += Incr) { 2544 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2545 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2546 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 2547 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2548 } 2549 } 2550 2551 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 2552 MachineBasicBlock::iterator I, 2553 DebugLoc DL, unsigned DestReg, 2554 unsigned SrcReg, bool KillSrc, 2555 unsigned Opcode, unsigned ZeroReg, 2556 llvm::ArrayRef<unsigned> Indices) const { 2557 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2558 unsigned NumRegs = Indices.size(); 2559 2560 #ifndef NDEBUG 2561 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2562 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2563 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 2564 "GPR reg sequences should not be able to overlap"); 2565 #endif 2566 2567 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 2568 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2569 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 
2570 MIB.addReg(ZeroReg); 2571 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2572 MIB.addImm(0); 2573 } 2574 } 2575 2576 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 2577 MachineBasicBlock::iterator I, 2578 const DebugLoc &DL, MCRegister DestReg, 2579 MCRegister SrcReg, bool KillSrc) const { 2580 if (AArch64::GPR32spRegClass.contains(DestReg) && 2581 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 2582 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2583 2584 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 2585 // If either operand is WSP, expand to ADD #0. 2586 if (Subtarget.hasZeroCycleRegMove()) { 2587 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 2588 MCRegister DestRegX = TRI->getMatchingSuperReg( 2589 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2590 MCRegister SrcRegX = TRI->getMatchingSuperReg( 2591 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2592 // This instruction is reading and writing X registers. This may upset 2593 // the register scavenger and machine verifier, so we need to indicate 2594 // that we are reading an undefined value from SrcRegX, but a proper 2595 // value from SrcReg. 
2596 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 2597 .addReg(SrcRegX, RegState::Undef) 2598 .addImm(0) 2599 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 2600 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2601 } else { 2602 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 2603 .addReg(SrcReg, getKillRegState(KillSrc)) 2604 .addImm(0) 2605 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2606 } 2607 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 2608 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 2609 .addImm(0) 2610 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2611 } else { 2612 if (Subtarget.hasZeroCycleRegMove()) { 2613 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 2614 MCRegister DestRegX = TRI->getMatchingSuperReg( 2615 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2616 MCRegister SrcRegX = TRI->getMatchingSuperReg( 2617 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2618 // This instruction is reading and writing X registers. This may upset 2619 // the register scavenger and machine verifier, so we need to indicate 2620 // that we are reading an undefined value from SrcRegX, but a proper 2621 // value from SrcReg. 2622 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 2623 .addReg(AArch64::XZR) 2624 .addReg(SrcRegX, RegState::Undef) 2625 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2626 } else { 2627 // Otherwise, expand to ORR WZR. 2628 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 2629 .addReg(AArch64::WZR) 2630 .addReg(SrcReg, getKillRegState(KillSrc)); 2631 } 2632 } 2633 return; 2634 } 2635 2636 // Copy a Predicate register by ORRing with itself. 
2637 if (AArch64::PPRRegClass.contains(DestReg) && 2638 AArch64::PPRRegClass.contains(SrcReg)) { 2639 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2640 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 2641 .addReg(SrcReg) // Pg 2642 .addReg(SrcReg) 2643 .addReg(SrcReg, getKillRegState(KillSrc)); 2644 return; 2645 } 2646 2647 // Copy a Z register by ORRing with itself. 2648 if (AArch64::ZPRRegClass.contains(DestReg) && 2649 AArch64::ZPRRegClass.contains(SrcReg)) { 2650 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2651 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 2652 .addReg(SrcReg) 2653 .addReg(SrcReg, getKillRegState(KillSrc)); 2654 return; 2655 } 2656 2657 if (AArch64::GPR64spRegClass.contains(DestReg) && 2658 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 2659 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 2660 // If either operand is SP, expand to ADD #0. 2661 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 2662 .addReg(SrcReg, getKillRegState(KillSrc)) 2663 .addImm(0) 2664 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2665 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 2666 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 2667 .addImm(0) 2668 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2669 } else { 2670 // Otherwise, expand to ORR XZR. 2671 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 2672 .addReg(AArch64::XZR) 2673 .addReg(SrcReg, getKillRegState(KillSrc)); 2674 } 2675 return; 2676 } 2677 2678 // Copy a DDDD register quad by copying the individual sub-registers. 
2679 if (AArch64::DDDDRegClass.contains(DestReg) && 2680 AArch64::DDDDRegClass.contains(SrcReg)) { 2681 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 2682 AArch64::dsub2, AArch64::dsub3}; 2683 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2684 Indices); 2685 return; 2686 } 2687 2688 // Copy a DDD register triple by copying the individual sub-registers. 2689 if (AArch64::DDDRegClass.contains(DestReg) && 2690 AArch64::DDDRegClass.contains(SrcReg)) { 2691 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 2692 AArch64::dsub2}; 2693 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2694 Indices); 2695 return; 2696 } 2697 2698 // Copy a DD register pair by copying the individual sub-registers. 2699 if (AArch64::DDRegClass.contains(DestReg) && 2700 AArch64::DDRegClass.contains(SrcReg)) { 2701 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 2702 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2703 Indices); 2704 return; 2705 } 2706 2707 // Copy a QQQQ register quad by copying the individual sub-registers. 2708 if (AArch64::QQQQRegClass.contains(DestReg) && 2709 AArch64::QQQQRegClass.contains(SrcReg)) { 2710 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 2711 AArch64::qsub2, AArch64::qsub3}; 2712 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2713 Indices); 2714 return; 2715 } 2716 2717 // Copy a QQQ register triple by copying the individual sub-registers. 2718 if (AArch64::QQQRegClass.contains(DestReg) && 2719 AArch64::QQQRegClass.contains(SrcReg)) { 2720 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 2721 AArch64::qsub2}; 2722 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2723 Indices); 2724 return; 2725 } 2726 2727 // Copy a QQ register pair by copying the individual sub-registers. 
2728 if (AArch64::QQRegClass.contains(DestReg) && 2729 AArch64::QQRegClass.contains(SrcReg)) { 2730 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 2731 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2732 Indices); 2733 return; 2734 } 2735 2736 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 2737 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 2738 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 2739 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 2740 AArch64::XZR, Indices); 2741 return; 2742 } 2743 2744 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 2745 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 2746 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 2747 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 2748 AArch64::WZR, Indices); 2749 return; 2750 } 2751 2752 if (AArch64::FPR128RegClass.contains(DestReg) && 2753 AArch64::FPR128RegClass.contains(SrcReg)) { 2754 if (Subtarget.hasNEON()) { 2755 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2756 .addReg(SrcReg) 2757 .addReg(SrcReg, getKillRegState(KillSrc)); 2758 } else { 2759 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 2760 .addReg(AArch64::SP, RegState::Define) 2761 .addReg(SrcReg, getKillRegState(KillSrc)) 2762 .addReg(AArch64::SP) 2763 .addImm(-16); 2764 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 2765 .addReg(AArch64::SP, RegState::Define) 2766 .addReg(DestReg, RegState::Define) 2767 .addReg(AArch64::SP) 2768 .addImm(16); 2769 } 2770 return; 2771 } 2772 2773 if (AArch64::FPR64RegClass.contains(DestReg) && 2774 AArch64::FPR64RegClass.contains(SrcReg)) { 2775 if (Subtarget.hasNEON()) { 2776 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, 2777 &AArch64::FPR128RegClass); 2778 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, 2779 &AArch64::FPR128RegClass); 2780 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2781 
.addReg(SrcReg) 2782 .addReg(SrcReg, getKillRegState(KillSrc)); 2783 } else { 2784 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 2785 .addReg(SrcReg, getKillRegState(KillSrc)); 2786 } 2787 return; 2788 } 2789 2790 if (AArch64::FPR32RegClass.contains(DestReg) && 2791 AArch64::FPR32RegClass.contains(SrcReg)) { 2792 if (Subtarget.hasNEON()) { 2793 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, 2794 &AArch64::FPR128RegClass); 2795 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, 2796 &AArch64::FPR128RegClass); 2797 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2798 .addReg(SrcReg) 2799 .addReg(SrcReg, getKillRegState(KillSrc)); 2800 } else { 2801 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2802 .addReg(SrcReg, getKillRegState(KillSrc)); 2803 } 2804 return; 2805 } 2806 2807 if (AArch64::FPR16RegClass.contains(DestReg) && 2808 AArch64::FPR16RegClass.contains(SrcReg)) { 2809 if (Subtarget.hasNEON()) { 2810 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 2811 &AArch64::FPR128RegClass); 2812 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 2813 &AArch64::FPR128RegClass); 2814 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2815 .addReg(SrcReg) 2816 .addReg(SrcReg, getKillRegState(KillSrc)); 2817 } else { 2818 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 2819 &AArch64::FPR32RegClass); 2820 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 2821 &AArch64::FPR32RegClass); 2822 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2823 .addReg(SrcReg, getKillRegState(KillSrc)); 2824 } 2825 return; 2826 } 2827 2828 if (AArch64::FPR8RegClass.contains(DestReg) && 2829 AArch64::FPR8RegClass.contains(SrcReg)) { 2830 if (Subtarget.hasNEON()) { 2831 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 2832 &AArch64::FPR128RegClass); 2833 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 2834 &AArch64::FPR128RegClass); 2835 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2836 .addReg(SrcReg) 2837 
.addReg(SrcReg, getKillRegState(KillSrc)); 2838 } else { 2839 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 2840 &AArch64::FPR32RegClass); 2841 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 2842 &AArch64::FPR32RegClass); 2843 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2844 .addReg(SrcReg, getKillRegState(KillSrc)); 2845 } 2846 return; 2847 } 2848 2849 // Copies between GPR64 and FPR64. 2850 if (AArch64::FPR64RegClass.contains(DestReg) && 2851 AArch64::GPR64RegClass.contains(SrcReg)) { 2852 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 2853 .addReg(SrcReg, getKillRegState(KillSrc)); 2854 return; 2855 } 2856 if (AArch64::GPR64RegClass.contains(DestReg) && 2857 AArch64::FPR64RegClass.contains(SrcReg)) { 2858 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 2859 .addReg(SrcReg, getKillRegState(KillSrc)); 2860 return; 2861 } 2862 // Copies between GPR32 and FPR32. 2863 if (AArch64::FPR32RegClass.contains(DestReg) && 2864 AArch64::GPR32RegClass.contains(SrcReg)) { 2865 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 2866 .addReg(SrcReg, getKillRegState(KillSrc)); 2867 return; 2868 } 2869 if (AArch64::GPR32RegClass.contains(DestReg) && 2870 AArch64::FPR32RegClass.contains(SrcReg)) { 2871 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 2872 .addReg(SrcReg, getKillRegState(KillSrc)); 2873 return; 2874 } 2875 2876 if (DestReg == AArch64::NZCV) { 2877 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 2878 BuildMI(MBB, I, DL, get(AArch64::MSR)) 2879 .addImm(AArch64SysReg::NZCV) 2880 .addReg(SrcReg, getKillRegState(KillSrc)) 2881 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 2882 return; 2883 } 2884 2885 if (SrcReg == AArch64::NZCV) { 2886 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 2887 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 2888 .addImm(AArch64SysReg::NZCV) 2889 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 2890 return; 2891 } 2892 
2893 llvm_unreachable("unimplemented reg-to-reg copy"); 2894 } 2895 2896 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 2897 MachineBasicBlock &MBB, 2898 MachineBasicBlock::iterator InsertBefore, 2899 const MCInstrDesc &MCID, 2900 Register SrcReg, bool IsKill, 2901 unsigned SubIdx0, unsigned SubIdx1, int FI, 2902 MachineMemOperand *MMO) { 2903 Register SrcReg0 = SrcReg; 2904 Register SrcReg1 = SrcReg; 2905 if (Register::isPhysicalRegister(SrcReg)) { 2906 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 2907 SubIdx0 = 0; 2908 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 2909 SubIdx1 = 0; 2910 } 2911 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 2912 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 2913 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 2914 .addFrameIndex(FI) 2915 .addImm(0) 2916 .addMemOperand(MMO); 2917 } 2918 2919 void AArch64InstrInfo::storeRegToStackSlot( 2920 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, 2921 bool isKill, int FI, const TargetRegisterClass *RC, 2922 const TargetRegisterInfo *TRI) const { 2923 MachineFunction &MF = *MBB.getParent(); 2924 MachineFrameInfo &MFI = MF.getFrameInfo(); 2925 2926 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 2927 MachineMemOperand *MMO = 2928 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 2929 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 2930 unsigned Opc = 0; 2931 bool Offset = true; 2932 switch (TRI->getSpillSize(*RC)) { 2933 case 1: 2934 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 2935 Opc = AArch64::STRBui; 2936 break; 2937 case 2: 2938 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 2939 Opc = AArch64::STRHui; 2940 break; 2941 case 4: 2942 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 2943 Opc = AArch64::STRWui; 2944 if (Register::isVirtualRegister(SrcReg)) 2945 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 2946 else 2947 assert(SrcReg != AArch64::WSP); 2948 } else if 
(AArch64::FPR32RegClass.hasSubClassEq(RC)) 2949 Opc = AArch64::STRSui; 2950 break; 2951 case 8: 2952 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 2953 Opc = AArch64::STRXui; 2954 if (Register::isVirtualRegister(SrcReg)) 2955 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 2956 else 2957 assert(SrcReg != AArch64::SP); 2958 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 2959 Opc = AArch64::STRDui; 2960 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 2961 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 2962 get(AArch64::STPWi), SrcReg, isKill, 2963 AArch64::sube32, AArch64::subo32, FI, MMO); 2964 return; 2965 } 2966 break; 2967 case 16: 2968 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 2969 Opc = AArch64::STRQui; 2970 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 2971 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2972 Opc = AArch64::ST1Twov1d; 2973 Offset = false; 2974 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 2975 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 2976 get(AArch64::STPXi), SrcReg, isKill, 2977 AArch64::sube64, AArch64::subo64, FI, MMO); 2978 return; 2979 } 2980 break; 2981 case 24: 2982 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 2983 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2984 Opc = AArch64::ST1Threev1d; 2985 Offset = false; 2986 } 2987 break; 2988 case 32: 2989 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 2990 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2991 Opc = AArch64::ST1Fourv1d; 2992 Offset = false; 2993 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 2994 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2995 Opc = AArch64::ST1Twov2d; 2996 Offset = false; 2997 } 2998 break; 2999 case 48: 3000 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3001 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3002 Opc = 
AArch64::ST1Threev2d; 3003 Offset = false; 3004 } 3005 break; 3006 case 64: 3007 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3008 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3009 Opc = AArch64::ST1Fourv2d; 3010 Offset = false; 3011 } 3012 break; 3013 } 3014 unsigned StackID = TargetStackID::Default; 3015 if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3016 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3017 Opc = AArch64::STR_PXI; 3018 StackID = TargetStackID::SVEVector; 3019 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3020 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3021 Opc = AArch64::STR_ZXI; 3022 StackID = TargetStackID::SVEVector; 3023 } 3024 assert(Opc && "Unknown register class"); 3025 MFI.setStackID(FI, StackID); 3026 3027 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3028 .addReg(SrcReg, getKillRegState(isKill)) 3029 .addFrameIndex(FI); 3030 3031 if (Offset) 3032 MI.addImm(0); 3033 MI.addMemOperand(MMO); 3034 } 3035 3036 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3037 MachineBasicBlock &MBB, 3038 MachineBasicBlock::iterator InsertBefore, 3039 const MCInstrDesc &MCID, 3040 Register DestReg, unsigned SubIdx0, 3041 unsigned SubIdx1, int FI, 3042 MachineMemOperand *MMO) { 3043 Register DestReg0 = DestReg; 3044 Register DestReg1 = DestReg; 3045 bool IsUndef = true; 3046 if (Register::isPhysicalRegister(DestReg)) { 3047 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 3048 SubIdx0 = 0; 3049 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 3050 SubIdx1 = 0; 3051 IsUndef = false; 3052 } 3053 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3054 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 3055 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 3056 .addFrameIndex(FI) 3057 .addImm(0) 3058 .addMemOperand(MMO); 3059 } 3060 3061 void AArch64InstrInfo::loadRegFromStackSlot( 3062 
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, 3063 int FI, const TargetRegisterClass *RC, 3064 const TargetRegisterInfo *TRI) const { 3065 MachineFunction &MF = *MBB.getParent(); 3066 MachineFrameInfo &MFI = MF.getFrameInfo(); 3067 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3068 MachineMemOperand *MMO = 3069 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 3070 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3071 3072 unsigned Opc = 0; 3073 bool Offset = true; 3074 switch (TRI->getSpillSize(*RC)) { 3075 case 1: 3076 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3077 Opc = AArch64::LDRBui; 3078 break; 3079 case 2: 3080 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3081 Opc = AArch64::LDRHui; 3082 break; 3083 case 4: 3084 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3085 Opc = AArch64::LDRWui; 3086 if (Register::isVirtualRegister(DestReg)) 3087 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 3088 else 3089 assert(DestReg != AArch64::WSP); 3090 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3091 Opc = AArch64::LDRSui; 3092 break; 3093 case 8: 3094 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3095 Opc = AArch64::LDRXui; 3096 if (Register::isVirtualRegister(DestReg)) 3097 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 3098 else 3099 assert(DestReg != AArch64::SP); 3100 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3101 Opc = AArch64::LDRDui; 3102 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3103 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3104 get(AArch64::LDPWi), DestReg, AArch64::sube32, 3105 AArch64::subo32, FI, MMO); 3106 return; 3107 } 3108 break; 3109 case 16: 3110 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3111 Opc = AArch64::LDRQui; 3112 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3113 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3114 Opc = AArch64::LD1Twov1d; 
3115 Offset = false; 3116 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3117 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3118 get(AArch64::LDPXi), DestReg, AArch64::sube64, 3119 AArch64::subo64, FI, MMO); 3120 return; 3121 } 3122 break; 3123 case 24: 3124 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3125 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3126 Opc = AArch64::LD1Threev1d; 3127 Offset = false; 3128 } 3129 break; 3130 case 32: 3131 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3132 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3133 Opc = AArch64::LD1Fourv1d; 3134 Offset = false; 3135 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3136 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3137 Opc = AArch64::LD1Twov2d; 3138 Offset = false; 3139 } 3140 break; 3141 case 48: 3142 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3143 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3144 Opc = AArch64::LD1Threev2d; 3145 Offset = false; 3146 } 3147 break; 3148 case 64: 3149 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3150 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3151 Opc = AArch64::LD1Fourv2d; 3152 Offset = false; 3153 } 3154 break; 3155 } 3156 3157 unsigned StackID = TargetStackID::Default; 3158 if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3159 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3160 Opc = AArch64::LDR_PXI; 3161 StackID = TargetStackID::SVEVector; 3162 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3163 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3164 Opc = AArch64::LDR_ZXI; 3165 StackID = TargetStackID::SVEVector; 3166 } 3167 assert(Opc && "Unknown register class"); 3168 MFI.setStackID(FI, StackID); 3169 3170 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3171 .addReg(DestReg, getDefRegState(true)) 3172 
/// Emit one or more \p Opc instructions computing DestReg = SrcReg op Offset,
/// inserting them before \p MBBI.
///
/// The caller chooses the opcode explicitly. Supported opcodes are the 64-bit
/// ADD/ADDS/SUB/SUBS immediate forms and the SVE ADDVL/ADDPL forms; anything
/// else is unreachable.
///
/// An offset too large for a single instruction is split over several
/// instructions. After the first instruction the partial result in \p DestReg
/// becomes the source of the next one. At least one instruction is always
/// emitted, even for a zero \p Offset (the loop is a do/while).
///
/// \param Offset      Amount to apply. For ADDVL/ADDPL a negative value is
///                    accepted and emitted as a negative immediate.
/// \param Flag        MI flag set on every emitted instruction (and on the
///                    SEH pseudo-instructions).
/// \param NeedsWinCFI If true, SEH pseudo-instructions are emitted for FP<->SP
///                    adjustments and for adjustments whose destination is SP.
/// \param HasWinCFI   Optional out-flag; set to true whenever \p NeedsWinCFI
///                    is true.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               const DebugLoc &DL, unsigned DestReg,
                               unsigned SrcReg, int64_t Offset, unsigned Opc,
                               const TargetInstrInfo *TII,
                               MachineInstr::MIFlag Flag, bool NeedsWinCFI,
                               bool *HasWinCFI) {
  int Sign = 1;
  // MaxEncoding is the largest immediate magnitude a single instruction takes;
  // ShiftSize is the optional left shift that can be applied to it (0 = none).
  unsigned MaxEncoding, ShiftSize;
  switch (Opc) {
  case AArch64::ADDXri:
  case AArch64::ADDSXri:
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
    MaxEncoding = 0xfff;
    ShiftSize = 12;
    break;
  case AArch64::ADDVL_XXI:
  case AArch64::ADDPL_XXI:
    MaxEncoding = 31;
    ShiftSize = 0;
    if (Offset < 0) {
      // Negative immediates reach one further (magnitude 32). Work on the
      // magnitude from here on and re-apply the sign when emitting.
      MaxEncoding = 32;
      Sign = -1;
      Offset = -Offset;
    }
    break;
  default:
    llvm_unreachable("Unsupported opcode");
  }

  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
  // scratch register.  If DestReg is a virtual register, use it as the
  // scratch register; otherwise, create a new virtual register (to be
  // replaced by the scavenger at the end of PEI).  That case can be optimized
  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
  // register can be loaded with offset%8 and the add/sub can use an extending
  // instruction with LSL#3.
  // Currently the function handles any offsets but generates a poor sequence
  // of code.
  //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");

  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
  do {
    // Take as much of the remaining offset as one instruction can encode.
    uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
    unsigned LocalShiftSize = 0;
    if (ThisVal > MaxEncoding) {
      // Too big for the unshifted form: emit the high part with the shift.
      // The low ShiftSize bits dropped here stay in Offset for a later
      // iteration (see the subtraction at the bottom of the loop).
      ThisVal = ThisVal >> ShiftSize;
      LocalShiftSize = ShiftSize;
    }
    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
           "Encoding cannot handle value that big");
    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
                   .addReg(SrcReg)
                   .addImm(Sign * (int)ThisVal);
    // Only opcodes with a shift field (ShiftSize != 0) get the shifter
    // operand; it is LSL #0 when this chunk is emitted unshifted.
    if (ShiftSize)
      MBI = MBI.addImm(
          AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
    MBI = MBI.setMIFlag(Flag);

    if (NeedsWinCFI) {
      assert(Sign == 1 && "SEH directives should always have a positive sign");
      // The byte amount this instruction actually applied.
      int Imm = (int)(ThisVal << LocalShiftSize);
      if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
          (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
        if (HasWinCFI)
          *HasWinCFI = true;
        if (Imm == 0)
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
        else
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
              .addImm(Imm)
              .setMIFlag(Flag);
        assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
                                      "emit a single SEH directive");
      } else if (DestReg == AArch64::SP) {
        if (HasWinCFI)
          *HasWinCFI = true;
        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(Imm)
            .setMIFlag(Flag);
      }
      // NOTE(review): this also reports WinCFI when neither branch above
      // emitted an SEH pseudo-instruction; the two writes inside the branches
      // are redundant with it. Confirm whether that is intended.
      if (HasWinCFI)
        *HasWinCFI = true;
    }

    // Subsequent instructions accumulate on top of the partial result.
    SrcReg = DestReg;
    Offset -= ThisVal << LocalShiftSize;
  } while (Offset);
}
TargetInstrInfo *TII, 3277 MachineInstr::MIFlag Flag, bool SetNZCV, 3278 bool NeedsWinCFI, bool *HasWinCFI) { 3279 int64_t Bytes, NumPredicateVectors, NumDataVectors; 3280 Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); 3281 3282 // First emit non-scalable frame offsets, or a simple 'mov'. 3283 if (Bytes || (!Offset && SrcReg != DestReg)) { 3284 assert((DestReg != AArch64::SP || Bytes % 16 == 0) && 3285 "SP increment/decrement not 16-byte aligned"); 3286 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 3287 if (Bytes < 0) { 3288 Bytes = -Bytes; 3289 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 3290 } 3291 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 3292 NeedsWinCFI, HasWinCFI); 3293 SrcReg = DestReg; 3294 } 3295 3296 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 3297 "SetNZCV not supported with SVE vectors"); 3298 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 3299 "WinCFI not supported with SVE vectors"); 3300 3301 if (NumDataVectors) { 3302 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 3303 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3304 SrcReg = DestReg; 3305 } 3306 3307 if (NumPredicateVectors) { 3308 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 3309 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 3310 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3311 } 3312 } 3313 3314 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 3315 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 3316 MachineBasicBlock::iterator InsertPt, int FrameIndex, 3317 LiveIntervals *LIS, VirtRegMap *VRM) const { 3318 // This is a bit of a hack. Consider this instruction: 3319 // 3320 // %0 = COPY %sp; GPR64all:%0 3321 // 3322 // We explicitly chose GPR64all for the virtual register so such a copy might 3323 // be eliminated by RegisterCoalescer. 
/// Try to fold a spill or fill of stack slot \p FrameIndex directly into the
/// COPY \p MI, replacing the copy plus separate spill/fill with a single store
/// or load inserted before \p InsertPt.
///
/// \p Ops lists the operand indices of \p MI being folded. Only a single
/// index is handled: 0 (the COPY's def, treated as a spill) or 1 (the COPY's
/// use, treated as a fill). \p LIS and \p VRM are not used here.
///
/// \returns the newly inserted load/store instruction, or nullptr if nothing
/// was folded. Note that the SP-copy case below returns nullptr but still has
/// a side effect: it constrains the virtual register's class.
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex,
    LiveIntervals *LIS, VirtRegMap *VRM) const {
  // This is a bit of a hack. Consider this instruction:
  //
  //   %0 = COPY %sp; GPR64all:%0
  //
  // We explicitly chose GPR64all for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, we are going to constrain the %0 register class here.
  //
  // <rdar://problem/11522048>
  //
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
  }

  // Handle the case where a copy is being spilled or filled but the source
  // and destination register class don't match.  For example:
  //
  //   %0 = COPY %xzr; GPR64common:%0
  //
  // In this case we can still safely fold away the COPY and generate the
  // following spill code:
  //
  //   STRXui %xzr, %stack.0
  //
  // This also eliminates spilled cross register class COPYs (e.g. between x and
  // d regs) of the same size.  For example:
  //
  //   %0 = COPY %1; GPR64:%0, FPR64:%1
  //
  // will be filled as
  //
  //   LDRDui %0, fi<#0>
  //
  // instead of
  //
  //   LDRXui %Temp, fi<#0>
  //   %0 = FMOV %Temp
  //
  if (MI.isCopy() && Ops.size() == 1 &&
      // Make sure we're only folding the explicit COPY defs/uses.
      (Ops[0] == 0 || Ops[0] == 1)) {
    // Folding the def (operand 0) means the copied value goes straight to the
    // stack slot; folding the use (operand 1) means it comes from the slot.
    bool IsSpill = Ops[0] == 0;
    bool IsFill = !IsSpill;
    const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    MachineBasicBlock &MBB = *MI.getParent();
    const MachineOperand &DstMO = MI.getOperand(0);
    const MachineOperand &SrcMO = MI.getOperand(1);
    Register DstReg = DstMO.getReg();
    Register SrcReg = SrcMO.getReg();
    // This is slightly expensive to compute for physical regs since
    // getMinimalPhysRegClass is slow.
    auto getRegClass = [&](unsigned Reg) {
      return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
                                              : TRI.getMinimalPhysRegClass(Reg);
    };

    // Full-register COPY: store the source / load the destination directly,
    // using that register's own class to pick the opcode.
    if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
      assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
                 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
             "Mismatched register size in non subreg COPY");
      if (IsSpill)
        storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
                            getRegClass(SrcReg), &TRI);
      else
        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
                             getRegClass(DstReg), &TRI);
      // The helper inserted the new instruction immediately before InsertPt.
      return &*--InsertPt;
    }

    // Handle cases like spilling def of:
    //
    //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
    //
    // where the physical register source can be widened and stored to the full
    // virtual reg destination stack slot, in this case producing:
    //
    //   STRXui %xzr, %stack.0
    //
    if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
      assert(SrcMO.getSubReg() == 0 &&
             "Unexpected subreg on physical register");
      // SpillRC is the wider class to store; SpillSubreg is the index through
      // which SrcReg sits inside it. SpillSubreg is only assigned (and only
      // read) on paths where SpillRC is non-null.
      const TargetRegisterClass *SpillRC;
      unsigned SpillSubreg;
      switch (DstMO.getSubReg()) {
      default:
        SpillRC = nullptr;
        break;
      case AArch64::sub_32:
      case AArch64::ssub:
        if (AArch64::GPR32RegClass.contains(SrcReg)) {
          SpillRC = &AArch64::GPR64RegClass;
          SpillSubreg = AArch64::sub_32;
        } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
          SpillRC = &AArch64::FPR64RegClass;
          SpillSubreg = AArch64::ssub;
        } else
          SpillRC = nullptr;
        break;
      case AArch64::dsub:
        if (AArch64::FPR64RegClass.contains(SrcReg)) {
          SpillRC = &AArch64::FPR128RegClass;
          SpillSubreg = AArch64::dsub;
        } else
          SpillRC = nullptr;
        break;
      }

      // Fold only if a matching super-register exists in SpillRC; otherwise
      // fall through to the remaining cases.
      if (SpillRC)
        if (unsigned WidenedSrcReg =
                TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
          storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
                              FrameIndex, SpillRC, &TRI);
          return &*--InsertPt;
        }
    }

    // Handle cases like filling use of:
    //
    //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
    //
    // where we can load the full virtual reg source stack slot, into the subreg
    // destination, in this case producing:
    //
    //   LDRWui %0:sub_32<def,read-undef>, %stack.0
    //
    if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
      // Register class matching the width of the destination sub-register.
      const TargetRegisterClass *FillRC;
      switch (DstMO.getSubReg()) {
      default:
        FillRC = nullptr;
        break;
      case AArch64::sub_32:
        FillRC = &AArch64::GPR32RegClass;
        break;
      case AArch64::ssub:
        FillRC = &AArch64::FPR32RegClass;
        break;
      case AArch64::dsub:
        FillRC = &AArch64::FPR64RegClass;
        break;
      }

      if (FillRC) {
        assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
                   TRI.getRegSizeInBits(*FillRC) &&
               "Mismatched regclass size on folded subreg COPY");
        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
        // Patch the freshly built load so that it defines the same
        // sub-register, with the undef flag, as the COPY it replaces.
        MachineInstr &LoadMI = *--InsertPt;
        MachineOperand &LoadDst = LoadMI.getOperand(0);
        assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
        LoadDst.setSubReg(DstMO.getSubReg());
        LoadDst.setIsUndef();
        return &LoadMI;
      }
    }
  }

  // Cannot fold.
  return nullptr;
}
// Cannot fold. 3489 return nullptr; 3490 } 3491 3492 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 3493 StackOffset &SOffset, 3494 bool *OutUseUnscaledOp, 3495 unsigned *OutUnscaledOp, 3496 int64_t *EmittableOffset) { 3497 // Set output values in case of early exit. 3498 if (EmittableOffset) 3499 *EmittableOffset = 0; 3500 if (OutUseUnscaledOp) 3501 *OutUseUnscaledOp = false; 3502 if (OutUnscaledOp) 3503 *OutUnscaledOp = 0; 3504 3505 // Exit early for structured vector spills/fills as they can't take an 3506 // immediate offset. 3507 switch (MI.getOpcode()) { 3508 default: 3509 break; 3510 case AArch64::LD1Twov2d: 3511 case AArch64::LD1Threev2d: 3512 case AArch64::LD1Fourv2d: 3513 case AArch64::LD1Twov1d: 3514 case AArch64::LD1Threev1d: 3515 case AArch64::LD1Fourv1d: 3516 case AArch64::ST1Twov2d: 3517 case AArch64::ST1Threev2d: 3518 case AArch64::ST1Fourv2d: 3519 case AArch64::ST1Twov1d: 3520 case AArch64::ST1Threev1d: 3521 case AArch64::ST1Fourv1d: 3522 case AArch64::IRG: 3523 case AArch64::IRGstack: 3524 case AArch64::STGloop: 3525 case AArch64::STZGloop: 3526 return AArch64FrameOffsetCannotUpdate; 3527 } 3528 3529 // Get the min/max offset and the scale. 3530 TypeSize ScaleValue(0U, false); 3531 unsigned Width; 3532 int64_t MinOff, MaxOff; 3533 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 3534 MaxOff)) 3535 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3536 3537 // Construct the complete offset. 3538 bool IsMulVL = ScaleValue.isScalable(); 3539 unsigned Scale = ScaleValue.getKnownMinSize(); 3540 int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes(); 3541 3542 const MachineOperand &ImmOpnd = 3543 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 3544 Offset += ImmOpnd.getImm() * Scale; 3545 3546 // If the offset doesn't match the scale, we rewrite the instruction to 3547 // use the unscaled instruction instead. 
Likewise, if we have a negative 3548 // offset and there is an unscaled op to use. 3549 Optional<unsigned> UnscaledOp = 3550 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 3551 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 3552 if (useUnscaledOp && 3553 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 3554 MaxOff)) 3555 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3556 3557 Scale = ScaleValue.getKnownMinSize(); 3558 assert(IsMulVL == ScaleValue.isScalable() && 3559 "Unscaled opcode has different value for scalable"); 3560 3561 int64_t Remainder = Offset % Scale; 3562 assert(!(Remainder && useUnscaledOp) && 3563 "Cannot have remainder when using unscaled op"); 3564 3565 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 3566 int64_t NewOffset = Offset / Scale; 3567 if (MinOff <= NewOffset && NewOffset <= MaxOff) 3568 Offset = Remainder; 3569 else { 3570 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 3571 Offset = Offset - NewOffset * Scale + Remainder; 3572 } 3573 3574 if (EmittableOffset) 3575 *EmittableOffset = NewOffset; 3576 if (OutUseUnscaledOp) 3577 *OutUseUnscaledOp = useUnscaledOp; 3578 if (OutUnscaledOp && UnscaledOp) 3579 *OutUnscaledOp = *UnscaledOp; 3580 3581 if (IsMulVL) 3582 SOffset = StackOffset(Offset, MVT::nxv1i8) + 3583 StackOffset(SOffset.getBytes(), MVT::i8); 3584 else 3585 SOffset = StackOffset(Offset, MVT::i8) + 3586 StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8); 3587 return AArch64FrameOffsetCanUpdate | 3588 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 3589 } 3590 3591 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 3592 unsigned FrameReg, StackOffset &Offset, 3593 const AArch64InstrInfo *TII) { 3594 unsigned Opcode = MI.getOpcode(); 3595 unsigned ImmIdx = FrameRegIdx + 1; 3596 3597 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 3598 Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8); 3599 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 3600 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 3601 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 3602 MI.eraseFromParent(); 3603 Offset = StackOffset(); 3604 return true; 3605 } 3606 3607 int64_t NewOffset; 3608 unsigned UnscaledOp; 3609 bool UseUnscaledOp; 3610 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 3611 &UnscaledOp, &NewOffset); 3612 if (Status & AArch64FrameOffsetCanUpdate) { 3613 if (Status & AArch64FrameOffsetIsLegal) 3614 // Replace the FrameIndex with FrameReg. 3615 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 3616 if (UseUnscaledOp) 3617 MI.setDesc(TII->get(UnscaledOp)); 3618 3619 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 3620 return !Offset; 3621 } 3622 3623 return false; 3624 } 3625 3626 void AArch64InstrInfo::getNoop(MCInst &NopInst) const { 3627 NopInst.setOpcode(AArch64::HINT); 3628 NopInst.addOperand(MCOperand::createImm(0)); 3629 } 3630 3631 // AArch64 supports MachineCombiner. 3632 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 3633 3634 // True when Opc sets flag 3635 static bool isCombineInstrSettingFlag(unsigned Opc) { 3636 switch (Opc) { 3637 case AArch64::ADDSWrr: 3638 case AArch64::ADDSWri: 3639 case AArch64::ADDSXrr: 3640 case AArch64::ADDSXri: 3641 case AArch64::SUBSWrr: 3642 case AArch64::SUBSXrr: 3643 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
3644 case AArch64::SUBSWri: 3645 case AArch64::SUBSXri: 3646 return true; 3647 default: 3648 break; 3649 } 3650 return false; 3651 } 3652 3653 // 32b Opcodes that can be combined with a MUL 3654 static bool isCombineInstrCandidate32(unsigned Opc) { 3655 switch (Opc) { 3656 case AArch64::ADDWrr: 3657 case AArch64::ADDWri: 3658 case AArch64::SUBWrr: 3659 case AArch64::ADDSWrr: 3660 case AArch64::ADDSWri: 3661 case AArch64::SUBSWrr: 3662 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 3663 case AArch64::SUBWri: 3664 case AArch64::SUBSWri: 3665 return true; 3666 default: 3667 break; 3668 } 3669 return false; 3670 } 3671 3672 // 64b Opcodes that can be combined with a MUL 3673 static bool isCombineInstrCandidate64(unsigned Opc) { 3674 switch (Opc) { 3675 case AArch64::ADDXrr: 3676 case AArch64::ADDXri: 3677 case AArch64::SUBXrr: 3678 case AArch64::ADDSXrr: 3679 case AArch64::ADDSXri: 3680 case AArch64::SUBSXrr: 3681 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
3682 case AArch64::SUBXri: 3683 case AArch64::SUBSXri: 3684 case AArch64::ADDv8i8: 3685 case AArch64::ADDv16i8: 3686 case AArch64::ADDv4i16: 3687 case AArch64::ADDv8i16: 3688 case AArch64::ADDv2i32: 3689 case AArch64::ADDv4i32: 3690 case AArch64::SUBv8i8: 3691 case AArch64::SUBv16i8: 3692 case AArch64::SUBv4i16: 3693 case AArch64::SUBv8i16: 3694 case AArch64::SUBv2i32: 3695 case AArch64::SUBv4i32: 3696 return true; 3697 default: 3698 break; 3699 } 3700 return false; 3701 } 3702 3703 // FP Opcodes that can be combined with a FMUL 3704 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 3705 switch (Inst.getOpcode()) { 3706 default: 3707 break; 3708 case AArch64::FADDHrr: 3709 case AArch64::FADDSrr: 3710 case AArch64::FADDDrr: 3711 case AArch64::FADDv4f16: 3712 case AArch64::FADDv8f16: 3713 case AArch64::FADDv2f32: 3714 case AArch64::FADDv2f64: 3715 case AArch64::FADDv4f32: 3716 case AArch64::FSUBHrr: 3717 case AArch64::FSUBSrr: 3718 case AArch64::FSUBDrr: 3719 case AArch64::FSUBv4f16: 3720 case AArch64::FSUBv8f16: 3721 case AArch64::FSUBv2f32: 3722 case AArch64::FSUBv2f64: 3723 case AArch64::FSUBv4f32: 3724 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 3725 return (Options.UnsafeFPMath || 3726 Options.AllowFPOpFusion == FPOpFusion::Fast); 3727 } 3728 return false; 3729 } 3730 3731 // Opcodes that can be combined with a MUL 3732 static bool isCombineInstrCandidate(unsigned Opc) { 3733 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 3734 } 3735 3736 // 3737 // Utility routine that checks if \param MO is defined by an 3738 // \param CombineOpc instruction in the basic block \param MBB 3739 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 3740 unsigned CombineOpc, unsigned ZeroReg = 0, 3741 bool CheckZeroReg = false) { 3742 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3743 MachineInstr *MI = nullptr; 3744 3745 if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) 
3746 MI = MRI.getUniqueVRegDef(MO.getReg()); 3747 // And it needs to be in the trace (otherwise, it won't have a depth). 3748 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 3749 return false; 3750 // Must only used by the user we combine with. 3751 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 3752 return false; 3753 3754 if (CheckZeroReg) { 3755 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 3756 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 3757 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); 3758 // The third input reg must be zero. 3759 if (MI->getOperand(3).getReg() != ZeroReg) 3760 return false; 3761 } 3762 3763 return true; 3764 } 3765 3766 // 3767 // Is \param MO defined by an integer multiply and can be combined? 3768 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 3769 unsigned MulOpc, unsigned ZeroReg) { 3770 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 3771 } 3772 3773 // 3774 // Is \param MO defined by a floating-point multiply and can be combined? 3775 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 3776 unsigned MulOpc) { 3777 return canCombine(MBB, MO, MulOpc); 3778 } 3779 3780 // TODO: There are many more machine instruction opcodes to match: 3781 // 1. Other data types (integer, vectors) 3782 // 2. Other math / logic operations (xor, or) 3783 // 3. 
Other forms of the same operation (intrinsics and other variants) 3784 bool AArch64InstrInfo::isAssociativeAndCommutative( 3785 const MachineInstr &Inst) const { 3786 switch (Inst.getOpcode()) { 3787 case AArch64::FADDDrr: 3788 case AArch64::FADDSrr: 3789 case AArch64::FADDv2f32: 3790 case AArch64::FADDv2f64: 3791 case AArch64::FADDv4f32: 3792 case AArch64::FMULDrr: 3793 case AArch64::FMULSrr: 3794 case AArch64::FMULX32: 3795 case AArch64::FMULX64: 3796 case AArch64::FMULXv2f32: 3797 case AArch64::FMULXv2f64: 3798 case AArch64::FMULXv4f32: 3799 case AArch64::FMULv2f32: 3800 case AArch64::FMULv2f64: 3801 case AArch64::FMULv4f32: 3802 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; 3803 default: 3804 return false; 3805 } 3806 } 3807 3808 /// Find instructions that can be turned into madd. 3809 static bool getMaddPatterns(MachineInstr &Root, 3810 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 3811 unsigned Opc = Root.getOpcode(); 3812 MachineBasicBlock &MBB = *Root.getParent(); 3813 bool Found = false; 3814 3815 if (!isCombineInstrCandidate(Opc)) 3816 return false; 3817 if (isCombineInstrSettingFlag(Opc)) { 3818 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 3819 // When NZCV is live bail out. 3820 if (Cmp_NZCV == -1) 3821 return false; 3822 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 3823 // When opcode can't change bail out. 3824 // CHECKME: do we miss any cases for opcode conversion? 
3825 if (NewOpc == Opc) 3826 return false; 3827 Opc = NewOpc; 3828 } 3829 3830 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 3831 MachineCombinerPattern Pattern) { 3832 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 3833 Patterns.push_back(Pattern); 3834 Found = true; 3835 } 3836 }; 3837 3838 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 3839 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 3840 Patterns.push_back(Pattern); 3841 Found = true; 3842 } 3843 }; 3844 3845 typedef MachineCombinerPattern MCP; 3846 3847 switch (Opc) { 3848 default: 3849 break; 3850 case AArch64::ADDWrr: 3851 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3852 "ADDWrr does not have register operands"); 3853 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 3854 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 3855 break; 3856 case AArch64::ADDXrr: 3857 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 3858 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 3859 break; 3860 case AArch64::SUBWrr: 3861 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 3862 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 3863 break; 3864 case AArch64::SUBXrr: 3865 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 3866 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 3867 break; 3868 case AArch64::ADDWri: 3869 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 3870 break; 3871 case AArch64::ADDXri: 3872 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 3873 break; 3874 case AArch64::SUBWri: 3875 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 3876 break; 3877 case AArch64::SUBXri: 3878 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 3879 break; 3880 case AArch64::ADDv8i8: 3881 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 3882 
setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 3883 break; 3884 case AArch64::ADDv16i8: 3885 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 3886 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 3887 break; 3888 case AArch64::ADDv4i16: 3889 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 3890 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 3891 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 3892 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 3893 break; 3894 case AArch64::ADDv8i16: 3895 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 3896 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 3897 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 3898 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 3899 break; 3900 case AArch64::ADDv2i32: 3901 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 3902 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 3903 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 3904 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 3905 break; 3906 case AArch64::ADDv4i32: 3907 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 3908 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 3909 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 3910 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 3911 break; 3912 case AArch64::SUBv8i8: 3913 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 3914 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 3915 break; 3916 case AArch64::SUBv16i8: 3917 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 3918 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 3919 break; 3920 case AArch64::SUBv4i16: 3921 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 3922 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 3923 setVFound(AArch64::MULv4i16_indexed, 1, 
MCP::MULSUBv4i16_indexed_OP1); 3924 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 3925 break; 3926 case AArch64::SUBv8i16: 3927 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 3928 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 3929 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 3930 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 3931 break; 3932 case AArch64::SUBv2i32: 3933 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 3934 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 3935 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 3936 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 3937 break; 3938 case AArch64::SUBv4i32: 3939 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 3940 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 3941 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 3942 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 3943 break; 3944 } 3945 return Found; 3946 } 3947 /// Floating-Point Support 3948 3949 /// Find instructions that can be turned into madd. 
3950 static bool getFMAPatterns(MachineInstr &Root, 3951 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 3952 3953 if (!isCombineInstrCandidateFP(Root)) 3954 return false; 3955 3956 MachineBasicBlock &MBB = *Root.getParent(); 3957 bool Found = false; 3958 3959 auto Match = [&](int Opcode, int Operand, 3960 MachineCombinerPattern Pattern) -> bool { 3961 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 3962 Patterns.push_back(Pattern); 3963 return true; 3964 } 3965 return false; 3966 }; 3967 3968 typedef MachineCombinerPattern MCP; 3969 3970 switch (Root.getOpcode()) { 3971 default: 3972 assert(false && "Unsupported FP instruction in combiner\n"); 3973 break; 3974 case AArch64::FADDHrr: 3975 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3976 "FADDHrr does not have register operands"); 3977 3978 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 3979 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 3980 break; 3981 case AArch64::FADDSrr: 3982 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3983 "FADDSrr does not have register operands"); 3984 3985 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 3986 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 3987 3988 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 3989 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 3990 break; 3991 case AArch64::FADDDrr: 3992 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 3993 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 3994 3995 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 3996 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 3997 break; 3998 case AArch64::FADDv4f16: 3999 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 4000 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 4001 4002 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 4003 
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 4004 break; 4005 case AArch64::FADDv8f16: 4006 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 4007 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 4008 4009 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 4010 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 4011 break; 4012 case AArch64::FADDv2f32: 4013 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 4014 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 4015 4016 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 4017 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 4018 break; 4019 case AArch64::FADDv2f64: 4020 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 4021 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 4022 4023 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 4024 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 4025 break; 4026 case AArch64::FADDv4f32: 4027 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 4028 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 4029 4030 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 4031 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 4032 break; 4033 case AArch64::FSUBHrr: 4034 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 4035 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 4036 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 4037 break; 4038 case AArch64::FSUBSrr: 4039 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 4040 4041 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 4042 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 4043 4044 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 4045 break; 4046 case AArch64::FSUBDrr: 4047 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 4048 4049 Found |= 
Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 4050 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 4051 4052 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 4053 break; 4054 case AArch64::FSUBv4f16: 4055 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 4056 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 4057 4058 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 4059 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 4060 break; 4061 case AArch64::FSUBv8f16: 4062 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 4063 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 4064 4065 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 4066 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 4067 break; 4068 case AArch64::FSUBv2f32: 4069 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 4070 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 4071 4072 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 4073 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 4074 break; 4075 case AArch64::FSUBv2f64: 4076 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 4077 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 4078 4079 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 4080 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 4081 break; 4082 case AArch64::FSUBv4f32: 4083 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 4084 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 4085 4086 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 4087 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 4088 break; 4089 } 4090 return Found; 4091 } 4092 4093 /// Return true when a code sequence can improve throughput. It 4094 /// should be called only for instructions in loops. 
4095 /// \param Pattern - combiner pattern 4096 bool AArch64InstrInfo::isThroughputPattern( 4097 MachineCombinerPattern Pattern) const { 4098 switch (Pattern) { 4099 default: 4100 break; 4101 case MachineCombinerPattern::FMULADDH_OP1: 4102 case MachineCombinerPattern::FMULADDH_OP2: 4103 case MachineCombinerPattern::FMULSUBH_OP1: 4104 case MachineCombinerPattern::FMULSUBH_OP2: 4105 case MachineCombinerPattern::FMULADDS_OP1: 4106 case MachineCombinerPattern::FMULADDS_OP2: 4107 case MachineCombinerPattern::FMULSUBS_OP1: 4108 case MachineCombinerPattern::FMULSUBS_OP2: 4109 case MachineCombinerPattern::FMULADDD_OP1: 4110 case MachineCombinerPattern::FMULADDD_OP2: 4111 case MachineCombinerPattern::FMULSUBD_OP1: 4112 case MachineCombinerPattern::FMULSUBD_OP2: 4113 case MachineCombinerPattern::FNMULSUBH_OP1: 4114 case MachineCombinerPattern::FNMULSUBS_OP1: 4115 case MachineCombinerPattern::FNMULSUBD_OP1: 4116 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4117 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4118 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4119 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4120 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4121 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4122 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4123 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4124 case MachineCombinerPattern::FMLAv4f16_OP2: 4125 case MachineCombinerPattern::FMLAv4f16_OP1: 4126 case MachineCombinerPattern::FMLAv8f16_OP1: 4127 case MachineCombinerPattern::FMLAv8f16_OP2: 4128 case MachineCombinerPattern::FMLAv2f32_OP2: 4129 case MachineCombinerPattern::FMLAv2f32_OP1: 4130 case MachineCombinerPattern::FMLAv2f64_OP1: 4131 case MachineCombinerPattern::FMLAv2f64_OP2: 4132 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4133 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4134 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4135 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4136 case 
MachineCombinerPattern::FMLAv4f32_OP1: 4137 case MachineCombinerPattern::FMLAv4f32_OP2: 4138 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4139 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4140 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 4141 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4142 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 4143 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4144 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4145 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4146 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4147 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4148 case MachineCombinerPattern::FMLSv4f16_OP1: 4149 case MachineCombinerPattern::FMLSv4f16_OP2: 4150 case MachineCombinerPattern::FMLSv8f16_OP1: 4151 case MachineCombinerPattern::FMLSv8f16_OP2: 4152 case MachineCombinerPattern::FMLSv2f32_OP2: 4153 case MachineCombinerPattern::FMLSv2f64_OP2: 4154 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4155 case MachineCombinerPattern::FMLSv4f32_OP2: 4156 case MachineCombinerPattern::MULADDv8i8_OP1: 4157 case MachineCombinerPattern::MULADDv8i8_OP2: 4158 case MachineCombinerPattern::MULADDv16i8_OP1: 4159 case MachineCombinerPattern::MULADDv16i8_OP2: 4160 case MachineCombinerPattern::MULADDv4i16_OP1: 4161 case MachineCombinerPattern::MULADDv4i16_OP2: 4162 case MachineCombinerPattern::MULADDv8i16_OP1: 4163 case MachineCombinerPattern::MULADDv8i16_OP2: 4164 case MachineCombinerPattern::MULADDv2i32_OP1: 4165 case MachineCombinerPattern::MULADDv2i32_OP2: 4166 case MachineCombinerPattern::MULADDv4i32_OP1: 4167 case MachineCombinerPattern::MULADDv4i32_OP2: 4168 case MachineCombinerPattern::MULSUBv8i8_OP1: 4169 case MachineCombinerPattern::MULSUBv8i8_OP2: 4170 case MachineCombinerPattern::MULSUBv16i8_OP1: 4171 case MachineCombinerPattern::MULSUBv16i8_OP2: 4172 case MachineCombinerPattern::MULSUBv4i16_OP1: 4173 case MachineCombinerPattern::MULSUBv4i16_OP2: 4174 case 
MachineCombinerPattern::MULSUBv8i16_OP1: 4175 case MachineCombinerPattern::MULSUBv8i16_OP2: 4176 case MachineCombinerPattern::MULSUBv2i32_OP1: 4177 case MachineCombinerPattern::MULSUBv2i32_OP2: 4178 case MachineCombinerPattern::MULSUBv4i32_OP1: 4179 case MachineCombinerPattern::MULSUBv4i32_OP2: 4180 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4181 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4182 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4183 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4184 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4185 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4186 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4187 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4188 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4189 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4190 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4191 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4192 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4193 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4194 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4195 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4196 return true; 4197 } // end switch (Pattern) 4198 return false; 4199 } 4200 /// Return true when there is potentially a faster code sequence for an 4201 /// instruction chain ending in \p Root. All potential patterns are listed in 4202 /// the \p Pattern vector. Pattern should be sorted in priority order since the 4203 /// pattern evaluator stops checking as soon as it finds a faster sequence. 
4204 4205 bool AArch64InstrInfo::getMachineCombinerPatterns( 4206 MachineInstr &Root, 4207 SmallVectorImpl<MachineCombinerPattern> &Patterns) const { 4208 // Integer patterns 4209 if (getMaddPatterns(Root, Patterns)) 4210 return true; 4211 // Floating point patterns 4212 if (getFMAPatterns(Root, Patterns)) 4213 return true; 4214 4215 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); 4216 } 4217 4218 enum class FMAInstKind { Default, Indexed, Accumulator }; 4219 /// genFusedMultiply - Generate fused multiply instructions. 4220 /// This function supports both integer and floating point instructions. 4221 /// A typical example: 4222 /// F|MUL I=A,B,0 4223 /// F|ADD R,I,C 4224 /// ==> F|MADD R,A,B,C 4225 /// \param MF Containing MachineFunction 4226 /// \param MRI Register information 4227 /// \param TII Target information 4228 /// \param Root is the F|ADD instruction 4229 /// \param [out] InsInstrs is a vector of machine instructions and will 4230 /// contain the generated madd instruction 4231 /// \param IdxMulOpd is index of operand in Root that is the result of 4232 /// the F|MUL. In the example above IdxMulOpd is 1. 4233 /// \param MaddOpc the opcode fo the f|madd instruction 4234 /// \param RC Register class of operands 4235 /// \param kind of fma instruction (addressing mode) to be generated 4236 /// \param ReplacedAddend is the result register from the instruction 4237 /// replacing the non-combined operand, if any. 4238 static MachineInstr * 4239 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, 4240 const TargetInstrInfo *TII, MachineInstr &Root, 4241 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, 4242 unsigned MaddOpc, const TargetRegisterClass *RC, 4243 FMAInstKind kind = FMAInstKind::Default, 4244 const Register *ReplacedAddend = nullptr) { 4245 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 4246 4247 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; 4248 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 4249 Register ResultReg = Root.getOperand(0).getReg(); 4250 Register SrcReg0 = MUL->getOperand(1).getReg(); 4251 bool Src0IsKill = MUL->getOperand(1).isKill(); 4252 Register SrcReg1 = MUL->getOperand(2).getReg(); 4253 bool Src1IsKill = MUL->getOperand(2).isKill(); 4254 4255 unsigned SrcReg2; 4256 bool Src2IsKill; 4257 if (ReplacedAddend) { 4258 // If we just generated a new addend, we must be it's only use. 4259 SrcReg2 = *ReplacedAddend; 4260 Src2IsKill = true; 4261 } else { 4262 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 4263 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 4264 } 4265 4266 if (Register::isVirtualRegister(ResultReg)) 4267 MRI.constrainRegClass(ResultReg, RC); 4268 if (Register::isVirtualRegister(SrcReg0)) 4269 MRI.constrainRegClass(SrcReg0, RC); 4270 if (Register::isVirtualRegister(SrcReg1)) 4271 MRI.constrainRegClass(SrcReg1, RC); 4272 if (Register::isVirtualRegister(SrcReg2)) 4273 MRI.constrainRegClass(SrcReg2, RC); 4274 4275 MachineInstrBuilder MIB; 4276 if (kind == FMAInstKind::Default) 4277 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4278 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4279 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 4280 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 4281 else if (kind == FMAInstKind::Indexed) 4282 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4283 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 4284 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4285 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 4286 .addImm(MUL->getOperand(3).getImm()); 4287 else if (kind == FMAInstKind::Accumulator) 4288 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4289 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 4290 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4291 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 4292 else 4293 assert(false && "Invalid FMA instruction 
kind \n"); 4294 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 4295 InsInstrs.push_back(MIB); 4296 return MUL; 4297 } 4298 4299 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 4300 /// instructions. 4301 /// 4302 /// \see genFusedMultiply 4303 static MachineInstr *genFusedMultiplyAcc( 4304 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 4305 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 4306 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 4307 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 4308 FMAInstKind::Accumulator); 4309 } 4310 4311 /// genNeg - Helper to generate an intermediate negation of the second operand 4312 /// of Root 4313 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 4314 const TargetInstrInfo *TII, MachineInstr &Root, 4315 SmallVectorImpl<MachineInstr *> &InsInstrs, 4316 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 4317 unsigned MnegOpc, const TargetRegisterClass *RC) { 4318 Register NewVR = MRI.createVirtualRegister(RC); 4319 MachineInstrBuilder MIB = 4320 BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR) 4321 .add(Root.getOperand(2)); 4322 InsInstrs.push_back(MIB); 4323 4324 assert(InstrIdxForVirtReg.empty()); 4325 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4326 4327 return NewVR; 4328 } 4329 4330 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 4331 /// instructions with an additional negation of the accumulator 4332 static MachineInstr *genFusedMultiplyAccNeg( 4333 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 4334 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 4335 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 4336 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 4337 assert(IdxMulOpd == 1); 4338 4339 Register NewVR = 4340 genNeg(MF, MRI, TII, Root, InsInstrs, 
InstrIdxForVirtReg, MnegOpc, RC); 4341 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 4342 FMAInstKind::Accumulator, &NewVR); 4343 } 4344 4345 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 4346 /// instructions. 4347 /// 4348 /// \see genFusedMultiply 4349 static MachineInstr *genFusedMultiplyIdx( 4350 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 4351 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 4352 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 4353 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 4354 FMAInstKind::Indexed); 4355 } 4356 4357 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 4358 /// instructions with an additional negation of the accumulator 4359 static MachineInstr *genFusedMultiplyIdxNeg( 4360 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 4361 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 4362 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 4363 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 4364 assert(IdxMulOpd == 1); 4365 4366 Register NewVR = 4367 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 4368 4369 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 4370 FMAInstKind::Indexed, &NewVR); 4371 } 4372 4373 /// genMaddR - Generate madd instruction and combine mul and add using 4374 /// an extra virtual register 4375 /// Example - an ADD intermediate needs to be stored in a register: 4376 /// MUL I=A,B,0 4377 /// ADD R,I,Imm 4378 /// ==> ORR V, ZR, Imm 4379 /// ==> MADD R,A,B,V 4380 /// \param MF Containing MachineFunction 4381 /// \param MRI Register information 4382 /// \param TII Target information 4383 /// \param Root is the ADD instruction 4384 /// \param [out] InsInstrs is a vector of machine instructions and will 
4385 /// contain the generated madd instruction 4386 /// \param IdxMulOpd is index of operand in Root that is the result of 4387 /// the MUL. In the example above IdxMulOpd is 1. 4388 /// \param MaddOpc the opcode fo the madd instruction 4389 /// \param VR is a virtual register that holds the value of an ADD operand 4390 /// (V in the example above). 4391 /// \param RC Register class of operands 4392 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 4393 const TargetInstrInfo *TII, MachineInstr &Root, 4394 SmallVectorImpl<MachineInstr *> &InsInstrs, 4395 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 4396 const TargetRegisterClass *RC) { 4397 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 4398 4399 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 4400 Register ResultReg = Root.getOperand(0).getReg(); 4401 Register SrcReg0 = MUL->getOperand(1).getReg(); 4402 bool Src0IsKill = MUL->getOperand(1).isKill(); 4403 Register SrcReg1 = MUL->getOperand(2).getReg(); 4404 bool Src1IsKill = MUL->getOperand(2).isKill(); 4405 4406 if (Register::isVirtualRegister(ResultReg)) 4407 MRI.constrainRegClass(ResultReg, RC); 4408 if (Register::isVirtualRegister(SrcReg0)) 4409 MRI.constrainRegClass(SrcReg0, RC); 4410 if (Register::isVirtualRegister(SrcReg1)) 4411 MRI.constrainRegClass(SrcReg1, RC); 4412 if (Register::isVirtualRegister(VR)) 4413 MRI.constrainRegClass(VR, RC); 4414 4415 MachineInstrBuilder MIB = 4416 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4417 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4418 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 4419 .addReg(VR); 4420 // Insert the MADD 4421 InsInstrs.push_back(MIB); 4422 return MUL; 4423 } 4424 4425 /// When getMachineCombinerPatterns() finds potential patterns, 4426 /// this function generates the instructions that could replace the 4427 /// original code sequence 4428 void AArch64InstrInfo::genAlternativeCodeSequence( 4429 MachineInstr &Root, 
MachineCombinerPattern Pattern, 4430 SmallVectorImpl<MachineInstr *> &InsInstrs, 4431 SmallVectorImpl<MachineInstr *> &DelInstrs, 4432 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 4433 MachineBasicBlock &MBB = *Root.getParent(); 4434 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4435 MachineFunction &MF = *MBB.getParent(); 4436 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 4437 4438 MachineInstr *MUL; 4439 const TargetRegisterClass *RC; 4440 unsigned Opc; 4441 switch (Pattern) { 4442 default: 4443 // Reassociate instructions. 4444 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 4445 DelInstrs, InstrIdxForVirtReg); 4446 return; 4447 case MachineCombinerPattern::MULADDW_OP1: 4448 case MachineCombinerPattern::MULADDX_OP1: 4449 // MUL I=A,B,0 4450 // ADD R,I,C 4451 // ==> MADD R,A,B,C 4452 // --- Create(MADD); 4453 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 4454 Opc = AArch64::MADDWrrr; 4455 RC = &AArch64::GPR32RegClass; 4456 } else { 4457 Opc = AArch64::MADDXrrr; 4458 RC = &AArch64::GPR64RegClass; 4459 } 4460 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4461 break; 4462 case MachineCombinerPattern::MULADDW_OP2: 4463 case MachineCombinerPattern::MULADDX_OP2: 4464 // MUL I=A,B,0 4465 // ADD R,C,I 4466 // ==> MADD R,A,B,C 4467 // --- Create(MADD); 4468 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 4469 Opc = AArch64::MADDWrrr; 4470 RC = &AArch64::GPR32RegClass; 4471 } else { 4472 Opc = AArch64::MADDXrrr; 4473 RC = &AArch64::GPR64RegClass; 4474 } 4475 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4476 break; 4477 case MachineCombinerPattern::MULADDWI_OP1: 4478 case MachineCombinerPattern::MULADDXI_OP1: { 4479 // MUL I=A,B,0 4480 // ADD R,I,Imm 4481 // ==> ORR V, ZR, Imm 4482 // ==> MADD R,A,B,V 4483 // --- Create(MADD); 4484 const TargetRegisterClass *OrrRC; 4485 unsigned BitSize, OrrOpc, ZeroReg; 4486 if (Pattern == 
MachineCombinerPattern::MULADDWI_OP1) { 4487 OrrOpc = AArch64::ORRWri; 4488 OrrRC = &AArch64::GPR32spRegClass; 4489 BitSize = 32; 4490 ZeroReg = AArch64::WZR; 4491 Opc = AArch64::MADDWrrr; 4492 RC = &AArch64::GPR32RegClass; 4493 } else { 4494 OrrOpc = AArch64::ORRXri; 4495 OrrRC = &AArch64::GPR64spRegClass; 4496 BitSize = 64; 4497 ZeroReg = AArch64::XZR; 4498 Opc = AArch64::MADDXrrr; 4499 RC = &AArch64::GPR64RegClass; 4500 } 4501 Register NewVR = MRI.createVirtualRegister(OrrRC); 4502 uint64_t Imm = Root.getOperand(2).getImm(); 4503 4504 if (Root.getOperand(3).isImm()) { 4505 unsigned Val = Root.getOperand(3).getImm(); 4506 Imm = Imm << Val; 4507 } 4508 uint64_t UImm = SignExtend64(Imm, BitSize); 4509 uint64_t Encoding; 4510 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4511 MachineInstrBuilder MIB1 = 4512 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4513 .addReg(ZeroReg) 4514 .addImm(Encoding); 4515 InsInstrs.push_back(MIB1); 4516 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4517 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4518 } 4519 break; 4520 } 4521 case MachineCombinerPattern::MULSUBW_OP1: 4522 case MachineCombinerPattern::MULSUBX_OP1: { 4523 // MUL I=A,B,0 4524 // SUB R,I, C 4525 // ==> SUB V, 0, C 4526 // ==> MADD R,A,B,V // = -C + A*B 4527 // --- Create(MADD); 4528 const TargetRegisterClass *SubRC; 4529 unsigned SubOpc, ZeroReg; 4530 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 4531 SubOpc = AArch64::SUBWrr; 4532 SubRC = &AArch64::GPR32spRegClass; 4533 ZeroReg = AArch64::WZR; 4534 Opc = AArch64::MADDWrrr; 4535 RC = &AArch64::GPR32RegClass; 4536 } else { 4537 SubOpc = AArch64::SUBXrr; 4538 SubRC = &AArch64::GPR64spRegClass; 4539 ZeroReg = AArch64::XZR; 4540 Opc = AArch64::MADDXrrr; 4541 RC = &AArch64::GPR64RegClass; 4542 } 4543 Register NewVR = MRI.createVirtualRegister(SubRC); 4544 // SUB NewVR, 0, C 4545 MachineInstrBuilder MIB1 = 4546 BuildMI(MF, Root.getDebugLoc(), 
TII->get(SubOpc), NewVR) 4547 .addReg(ZeroReg) 4548 .add(Root.getOperand(2)); 4549 InsInstrs.push_back(MIB1); 4550 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4551 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4552 break; 4553 } 4554 case MachineCombinerPattern::MULSUBW_OP2: 4555 case MachineCombinerPattern::MULSUBX_OP2: 4556 // MUL I=A,B,0 4557 // SUB R,C,I 4558 // ==> MSUB R,A,B,C (computes C - A*B) 4559 // --- Create(MSUB); 4560 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 4561 Opc = AArch64::MSUBWrrr; 4562 RC = &AArch64::GPR32RegClass; 4563 } else { 4564 Opc = AArch64::MSUBXrrr; 4565 RC = &AArch64::GPR64RegClass; 4566 } 4567 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4568 break; 4569 case MachineCombinerPattern::MULSUBWI_OP1: 4570 case MachineCombinerPattern::MULSUBXI_OP1: { 4571 // MUL I=A,B,0 4572 // SUB R,I, Imm 4573 // ==> ORR V, ZR, -Imm 4574 // ==> MADD R,A,B,V // = -Imm + A*B 4575 // --- Create(MADD); 4576 const TargetRegisterClass *OrrRC; 4577 unsigned BitSize, OrrOpc, ZeroReg; 4578 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 4579 OrrOpc = AArch64::ORRWri; 4580 OrrRC = &AArch64::GPR32spRegClass; 4581 BitSize = 32; 4582 ZeroReg = AArch64::WZR; 4583 Opc = AArch64::MADDWrrr; 4584 RC = &AArch64::GPR32RegClass; 4585 } else { 4586 OrrOpc = AArch64::ORRXri; 4587 OrrRC = &AArch64::GPR64spRegClass; 4588 BitSize = 64; 4589 ZeroReg = AArch64::XZR; 4590 Opc = AArch64::MADDXrrr; 4591 RC = &AArch64::GPR64RegClass; 4592 } 4593 Register NewVR = MRI.createVirtualRegister(OrrRC); 4594 uint64_t Imm = Root.getOperand(2).getImm(); 4595 if (Root.getOperand(3).isImm()) { 4596 unsigned Val = Root.getOperand(3).getImm(); 4597 Imm = Imm << Val; 4598 } 4599 uint64_t UImm = SignExtend64(-Imm, BitSize); 4600 uint64_t Encoding; 4601 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4602 MachineInstrBuilder MIB1 = 4603 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4604 
.addReg(ZeroReg) 4605 .addImm(Encoding); 4606 InsInstrs.push_back(MIB1); 4607 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4608 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4609 } 4610 break; 4611 } 4612 4613 case MachineCombinerPattern::MULADDv8i8_OP1: 4614 Opc = AArch64::MLAv8i8; 4615 RC = &AArch64::FPR64RegClass; 4616 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4617 break; 4618 case MachineCombinerPattern::MULADDv8i8_OP2: 4619 Opc = AArch64::MLAv8i8; 4620 RC = &AArch64::FPR64RegClass; 4621 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4622 break; 4623 case MachineCombinerPattern::MULADDv16i8_OP1: 4624 Opc = AArch64::MLAv16i8; 4625 RC = &AArch64::FPR128RegClass; 4626 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4627 break; 4628 case MachineCombinerPattern::MULADDv16i8_OP2: 4629 Opc = AArch64::MLAv16i8; 4630 RC = &AArch64::FPR128RegClass; 4631 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4632 break; 4633 case MachineCombinerPattern::MULADDv4i16_OP1: 4634 Opc = AArch64::MLAv4i16; 4635 RC = &AArch64::FPR64RegClass; 4636 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4637 break; 4638 case MachineCombinerPattern::MULADDv4i16_OP2: 4639 Opc = AArch64::MLAv4i16; 4640 RC = &AArch64::FPR64RegClass; 4641 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4642 break; 4643 case MachineCombinerPattern::MULADDv8i16_OP1: 4644 Opc = AArch64::MLAv8i16; 4645 RC = &AArch64::FPR128RegClass; 4646 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4647 break; 4648 case MachineCombinerPattern::MULADDv8i16_OP2: 4649 Opc = AArch64::MLAv8i16; 4650 RC = &AArch64::FPR128RegClass; 4651 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4652 break; 4653 case MachineCombinerPattern::MULADDv2i32_OP1: 4654 Opc = AArch64::MLAv2i32; 4655 RC = &AArch64::FPR64RegClass; 4656 MUL = 
genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4657 break; 4658 case MachineCombinerPattern::MULADDv2i32_OP2: 4659 Opc = AArch64::MLAv2i32; 4660 RC = &AArch64::FPR64RegClass; 4661 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4662 break; 4663 case MachineCombinerPattern::MULADDv4i32_OP1: 4664 Opc = AArch64::MLAv4i32; 4665 RC = &AArch64::FPR128RegClass; 4666 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4667 break; 4668 case MachineCombinerPattern::MULADDv4i32_OP2: 4669 Opc = AArch64::MLAv4i32; 4670 RC = &AArch64::FPR128RegClass; 4671 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4672 break; 4673 4674 case MachineCombinerPattern::MULSUBv8i8_OP1: 4675 Opc = AArch64::MLAv8i8; 4676 RC = &AArch64::FPR64RegClass; 4677 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4678 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 4679 RC); 4680 break; 4681 case MachineCombinerPattern::MULSUBv8i8_OP2: 4682 Opc = AArch64::MLSv8i8; 4683 RC = &AArch64::FPR64RegClass; 4684 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4685 break; 4686 case MachineCombinerPattern::MULSUBv16i8_OP1: 4687 Opc = AArch64::MLAv16i8; 4688 RC = &AArch64::FPR128RegClass; 4689 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4690 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 4691 RC); 4692 break; 4693 case MachineCombinerPattern::MULSUBv16i8_OP2: 4694 Opc = AArch64::MLSv16i8; 4695 RC = &AArch64::FPR128RegClass; 4696 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4697 break; 4698 case MachineCombinerPattern::MULSUBv4i16_OP1: 4699 Opc = AArch64::MLAv4i16; 4700 RC = &AArch64::FPR64RegClass; 4701 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4702 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 4703 RC); 4704 break; 4705 case MachineCombinerPattern::MULSUBv4i16_OP2: 4706 Opc = AArch64::MLSv4i16; 4707 RC = &AArch64::FPR64RegClass; 4708 MUL = 
genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4709 break; 4710 case MachineCombinerPattern::MULSUBv8i16_OP1: 4711 Opc = AArch64::MLAv8i16; 4712 RC = &AArch64::FPR128RegClass; 4713 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4714 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 4715 RC); 4716 break; 4717 case MachineCombinerPattern::MULSUBv8i16_OP2: 4718 Opc = AArch64::MLSv8i16; 4719 RC = &AArch64::FPR128RegClass; 4720 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4721 break; 4722 case MachineCombinerPattern::MULSUBv2i32_OP1: 4723 Opc = AArch64::MLAv2i32; 4724 RC = &AArch64::FPR64RegClass; 4725 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4726 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 4727 RC); 4728 break; 4729 case MachineCombinerPattern::MULSUBv2i32_OP2: 4730 Opc = AArch64::MLSv2i32; 4731 RC = &AArch64::FPR64RegClass; 4732 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4733 break; 4734 case MachineCombinerPattern::MULSUBv4i32_OP1: 4735 Opc = AArch64::MLAv4i32; 4736 RC = &AArch64::FPR128RegClass; 4737 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4738 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 4739 RC); 4740 break; 4741 case MachineCombinerPattern::MULSUBv4i32_OP2: 4742 Opc = AArch64::MLSv4i32; 4743 RC = &AArch64::FPR128RegClass; 4744 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4745 break; 4746 4747 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4748 Opc = AArch64::MLAv4i16_indexed; 4749 RC = &AArch64::FPR64RegClass; 4750 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4751 break; 4752 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4753 Opc = AArch64::MLAv4i16_indexed; 4754 RC = &AArch64::FPR64RegClass; 4755 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4756 break; 4757 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4758 Opc = AArch64::MLAv8i16_indexed; 4759 
RC = &AArch64::FPR128RegClass; 4760 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4761 break; 4762 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4763 Opc = AArch64::MLAv8i16_indexed; 4764 RC = &AArch64::FPR128RegClass; 4765 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4766 break; 4767 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4768 Opc = AArch64::MLAv2i32_indexed; 4769 RC = &AArch64::FPR64RegClass; 4770 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4771 break; 4772 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4773 Opc = AArch64::MLAv2i32_indexed; 4774 RC = &AArch64::FPR64RegClass; 4775 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4776 break; 4777 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4778 Opc = AArch64::MLAv4i32_indexed; 4779 RC = &AArch64::FPR128RegClass; 4780 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4781 break; 4782 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4783 Opc = AArch64::MLAv4i32_indexed; 4784 RC = &AArch64::FPR128RegClass; 4785 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4786 break; 4787 4788 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4789 Opc = AArch64::MLAv4i16_indexed; 4790 RC = &AArch64::FPR64RegClass; 4791 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4792 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 4793 RC); 4794 break; 4795 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4796 Opc = AArch64::MLSv4i16_indexed; 4797 RC = &AArch64::FPR64RegClass; 4798 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4799 break; 4800 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4801 Opc = AArch64::MLAv8i16_indexed; 4802 RC = &AArch64::FPR128RegClass; 4803 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4804 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 4805 RC); 4806 break; 4807 case 
MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4808 Opc = AArch64::MLSv8i16_indexed; 4809 RC = &AArch64::FPR128RegClass; 4810 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4811 break; 4812 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4813 Opc = AArch64::MLAv2i32_indexed; 4814 RC = &AArch64::FPR64RegClass; 4815 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4816 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 4817 RC); 4818 break; 4819 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4820 Opc = AArch64::MLSv2i32_indexed; 4821 RC = &AArch64::FPR64RegClass; 4822 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4823 break; 4824 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4825 Opc = AArch64::MLAv4i32_indexed; 4826 RC = &AArch64::FPR128RegClass; 4827 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4828 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 4829 RC); 4830 break; 4831 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4832 Opc = AArch64::MLSv4i32_indexed; 4833 RC = &AArch64::FPR128RegClass; 4834 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4835 break; 4836 4837 // Floating Point Support 4838 case MachineCombinerPattern::FMULADDH_OP1: 4839 Opc = AArch64::FMADDHrrr; 4840 RC = &AArch64::FPR16RegClass; 4841 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4842 break; 4843 case MachineCombinerPattern::FMULADDS_OP1: 4844 Opc = AArch64::FMADDSrrr; 4845 RC = &AArch64::FPR32RegClass; 4846 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4847 break; 4848 case MachineCombinerPattern::FMULADDD_OP1: 4849 Opc = AArch64::FMADDDrrr; 4850 RC = &AArch64::FPR64RegClass; 4851 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4852 break; 4853 4854 case MachineCombinerPattern::FMULADDH_OP2: 4855 Opc = AArch64::FMADDHrrr; 4856 RC = &AArch64::FPR16RegClass; 4857 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 
2, Opc, RC); 4858 break; 4859 case MachineCombinerPattern::FMULADDS_OP2: 4860 Opc = AArch64::FMADDSrrr; 4861 RC = &AArch64::FPR32RegClass; 4862 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4863 break; 4864 case MachineCombinerPattern::FMULADDD_OP2: 4865 Opc = AArch64::FMADDDrrr; 4866 RC = &AArch64::FPR64RegClass; 4867 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4868 break; 4869 4870 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4871 Opc = AArch64::FMLAv1i32_indexed; 4872 RC = &AArch64::FPR32RegClass; 4873 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4874 FMAInstKind::Indexed); 4875 break; 4876 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4877 Opc = AArch64::FMLAv1i32_indexed; 4878 RC = &AArch64::FPR32RegClass; 4879 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4880 FMAInstKind::Indexed); 4881 break; 4882 4883 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4884 Opc = AArch64::FMLAv1i64_indexed; 4885 RC = &AArch64::FPR64RegClass; 4886 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4887 FMAInstKind::Indexed); 4888 break; 4889 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4890 Opc = AArch64::FMLAv1i64_indexed; 4891 RC = &AArch64::FPR64RegClass; 4892 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4893 FMAInstKind::Indexed); 4894 break; 4895 4896 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4897 RC = &AArch64::FPR64RegClass; 4898 Opc = AArch64::FMLAv4i16_indexed; 4899 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4900 FMAInstKind::Indexed); 4901 break; 4902 case MachineCombinerPattern::FMLAv4f16_OP1: 4903 RC = &AArch64::FPR64RegClass; 4904 Opc = AArch64::FMLAv4f16; 4905 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4906 FMAInstKind::Accumulator); 4907 break; 4908 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4909 RC = &AArch64::FPR64RegClass; 4910 Opc = 
AArch64::FMLAv4i16_indexed; 4911 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4912 FMAInstKind::Indexed); 4913 break; 4914 case MachineCombinerPattern::FMLAv4f16_OP2: 4915 RC = &AArch64::FPR64RegClass; 4916 Opc = AArch64::FMLAv4f16; 4917 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4918 FMAInstKind::Accumulator); 4919 break; 4920 4921 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4922 case MachineCombinerPattern::FMLAv2f32_OP1: 4923 RC = &AArch64::FPR64RegClass; 4924 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 4925 Opc = AArch64::FMLAv2i32_indexed; 4926 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4927 FMAInstKind::Indexed); 4928 } else { 4929 Opc = AArch64::FMLAv2f32; 4930 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4931 FMAInstKind::Accumulator); 4932 } 4933 break; 4934 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4935 case MachineCombinerPattern::FMLAv2f32_OP2: 4936 RC = &AArch64::FPR64RegClass; 4937 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 4938 Opc = AArch64::FMLAv2i32_indexed; 4939 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4940 FMAInstKind::Indexed); 4941 } else { 4942 Opc = AArch64::FMLAv2f32; 4943 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4944 FMAInstKind::Accumulator); 4945 } 4946 break; 4947 4948 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4949 RC = &AArch64::FPR128RegClass; 4950 Opc = AArch64::FMLAv8i16_indexed; 4951 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4952 FMAInstKind::Indexed); 4953 break; 4954 case MachineCombinerPattern::FMLAv8f16_OP1: 4955 RC = &AArch64::FPR128RegClass; 4956 Opc = AArch64::FMLAv8f16; 4957 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4958 FMAInstKind::Accumulator); 4959 break; 4960 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4961 RC = &AArch64::FPR128RegClass; 4962 Opc = 
AArch64::FMLAv8i16_indexed; 4963 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4964 FMAInstKind::Indexed); 4965 break; 4966 case MachineCombinerPattern::FMLAv8f16_OP2: 4967 RC = &AArch64::FPR128RegClass; 4968 Opc = AArch64::FMLAv8f16; 4969 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4970 FMAInstKind::Accumulator); 4971 break; 4972 4973 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4974 case MachineCombinerPattern::FMLAv2f64_OP1: 4975 RC = &AArch64::FPR128RegClass; 4976 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 4977 Opc = AArch64::FMLAv2i64_indexed; 4978 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4979 FMAInstKind::Indexed); 4980 } else { 4981 Opc = AArch64::FMLAv2f64; 4982 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4983 FMAInstKind::Accumulator); 4984 } 4985 break; 4986 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4987 case MachineCombinerPattern::FMLAv2f64_OP2: 4988 RC = &AArch64::FPR128RegClass; 4989 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 4990 Opc = AArch64::FMLAv2i64_indexed; 4991 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4992 FMAInstKind::Indexed); 4993 } else { 4994 Opc = AArch64::FMLAv2f64; 4995 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4996 FMAInstKind::Accumulator); 4997 } 4998 break; 4999 5000 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5001 case MachineCombinerPattern::FMLAv4f32_OP1: 5002 RC = &AArch64::FPR128RegClass; 5003 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 5004 Opc = AArch64::FMLAv4i32_indexed; 5005 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5006 FMAInstKind::Indexed); 5007 } else { 5008 Opc = AArch64::FMLAv4f32; 5009 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5010 FMAInstKind::Accumulator); 5011 } 5012 break; 5013 5014 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5015 case 
MachineCombinerPattern::FMLAv4f32_OP2: 5016 RC = &AArch64::FPR128RegClass; 5017 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 5018 Opc = AArch64::FMLAv4i32_indexed; 5019 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5020 FMAInstKind::Indexed); 5021 } else { 5022 Opc = AArch64::FMLAv4f32; 5023 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5024 FMAInstKind::Accumulator); 5025 } 5026 break; 5027 5028 case MachineCombinerPattern::FMULSUBH_OP1: 5029 Opc = AArch64::FNMSUBHrrr; 5030 RC = &AArch64::FPR16RegClass; 5031 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5032 break; 5033 case MachineCombinerPattern::FMULSUBS_OP1: 5034 Opc = AArch64::FNMSUBSrrr; 5035 RC = &AArch64::FPR32RegClass; 5036 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5037 break; 5038 case MachineCombinerPattern::FMULSUBD_OP1: 5039 Opc = AArch64::FNMSUBDrrr; 5040 RC = &AArch64::FPR64RegClass; 5041 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5042 break; 5043 5044 case MachineCombinerPattern::FNMULSUBH_OP1: 5045 Opc = AArch64::FNMADDHrrr; 5046 RC = &AArch64::FPR16RegClass; 5047 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5048 break; 5049 case MachineCombinerPattern::FNMULSUBS_OP1: 5050 Opc = AArch64::FNMADDSrrr; 5051 RC = &AArch64::FPR32RegClass; 5052 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5053 break; 5054 case MachineCombinerPattern::FNMULSUBD_OP1: 5055 Opc = AArch64::FNMADDDrrr; 5056 RC = &AArch64::FPR64RegClass; 5057 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5058 break; 5059 5060 case MachineCombinerPattern::FMULSUBH_OP2: 5061 Opc = AArch64::FMSUBHrrr; 5062 RC = &AArch64::FPR16RegClass; 5063 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5064 break; 5065 case MachineCombinerPattern::FMULSUBS_OP2: 5066 Opc = AArch64::FMSUBSrrr; 5067 RC = &AArch64::FPR32RegClass; 5068 MUL = 
genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5069 break; 5070 case MachineCombinerPattern::FMULSUBD_OP2: 5071 Opc = AArch64::FMSUBDrrr; 5072 RC = &AArch64::FPR64RegClass; 5073 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5074 break; 5075 5076 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5077 Opc = AArch64::FMLSv1i32_indexed; 5078 RC = &AArch64::FPR32RegClass; 5079 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5080 FMAInstKind::Indexed); 5081 break; 5082 5083 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5084 Opc = AArch64::FMLSv1i64_indexed; 5085 RC = &AArch64::FPR64RegClass; 5086 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5087 FMAInstKind::Indexed); 5088 break; 5089 5090 case MachineCombinerPattern::FMLSv4f16_OP1: 5091 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 5092 RC = &AArch64::FPR64RegClass; 5093 Register NewVR = MRI.createVirtualRegister(RC); 5094 MachineInstrBuilder MIB1 = 5095 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) 5096 .add(Root.getOperand(2)); 5097 InsInstrs.push_back(MIB1); 5098 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5099 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 5100 Opc = AArch64::FMLAv4f16; 5101 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5102 FMAInstKind::Accumulator, &NewVR); 5103 } else { 5104 Opc = AArch64::FMLAv4i16_indexed; 5105 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5106 FMAInstKind::Indexed, &NewVR); 5107 } 5108 break; 5109 } 5110 case MachineCombinerPattern::FMLSv4f16_OP2: 5111 RC = &AArch64::FPR64RegClass; 5112 Opc = AArch64::FMLSv4f16; 5113 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5114 FMAInstKind::Accumulator); 5115 break; 5116 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5117 RC = &AArch64::FPR64RegClass; 5118 Opc = AArch64::FMLSv4i16_indexed; 5119 MUL = genFusedMultiply(MF, MRI, TII, Root, 
InsInstrs, 2, Opc, RC, 5120 FMAInstKind::Indexed); 5121 break; 5122 5123 case MachineCombinerPattern::FMLSv2f32_OP2: 5124 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5125 RC = &AArch64::FPR64RegClass; 5126 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 5127 Opc = AArch64::FMLSv2i32_indexed; 5128 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5129 FMAInstKind::Indexed); 5130 } else { 5131 Opc = AArch64::FMLSv2f32; 5132 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5133 FMAInstKind::Accumulator); 5134 } 5135 break; 5136 5137 case MachineCombinerPattern::FMLSv8f16_OP1: 5138 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 5139 RC = &AArch64::FPR128RegClass; 5140 Register NewVR = MRI.createVirtualRegister(RC); 5141 MachineInstrBuilder MIB1 = 5142 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) 5143 .add(Root.getOperand(2)); 5144 InsInstrs.push_back(MIB1); 5145 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5146 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 5147 Opc = AArch64::FMLAv8f16; 5148 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5149 FMAInstKind::Accumulator, &NewVR); 5150 } else { 5151 Opc = AArch64::FMLAv8i16_indexed; 5152 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5153 FMAInstKind::Indexed, &NewVR); 5154 } 5155 break; 5156 } 5157 case MachineCombinerPattern::FMLSv8f16_OP2: 5158 RC = &AArch64::FPR128RegClass; 5159 Opc = AArch64::FMLSv8f16; 5160 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5161 FMAInstKind::Accumulator); 5162 break; 5163 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5164 RC = &AArch64::FPR128RegClass; 5165 Opc = AArch64::FMLSv8i16_indexed; 5166 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5167 FMAInstKind::Indexed); 5168 break; 5169 5170 case MachineCombinerPattern::FMLSv2f64_OP2: 5171 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5172 RC = 
&AArch64::FPR128RegClass; 5173 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 5174 Opc = AArch64::FMLSv2i64_indexed; 5175 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5176 FMAInstKind::Indexed); 5177 } else { 5178 Opc = AArch64::FMLSv2f64; 5179 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5180 FMAInstKind::Accumulator); 5181 } 5182 break; 5183 5184 case MachineCombinerPattern::FMLSv4f32_OP2: 5185 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5186 RC = &AArch64::FPR128RegClass; 5187 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 5188 Opc = AArch64::FMLSv4i32_indexed; 5189 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5190 FMAInstKind::Indexed); 5191 } else { 5192 Opc = AArch64::FMLSv4f32; 5193 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5194 FMAInstKind::Accumulator); 5195 } 5196 break; 5197 case MachineCombinerPattern::FMLSv2f32_OP1: 5198 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 5199 RC = &AArch64::FPR64RegClass; 5200 Register NewVR = MRI.createVirtualRegister(RC); 5201 MachineInstrBuilder MIB1 = 5202 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) 5203 .add(Root.getOperand(2)); 5204 InsInstrs.push_back(MIB1); 5205 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5206 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 5207 Opc = AArch64::FMLAv2i32_indexed; 5208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5209 FMAInstKind::Indexed, &NewVR); 5210 } else { 5211 Opc = AArch64::FMLAv2f32; 5212 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5213 FMAInstKind::Accumulator, &NewVR); 5214 } 5215 break; 5216 } 5217 case MachineCombinerPattern::FMLSv4f32_OP1: 5218 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 5219 RC = &AArch64::FPR128RegClass; 5220 Register NewVR = MRI.createVirtualRegister(RC); 5221 MachineInstrBuilder MIB1 = 5222 BuildMI(MF, 
Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) 5223 .add(Root.getOperand(2)); 5224 InsInstrs.push_back(MIB1); 5225 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5226 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 5227 Opc = AArch64::FMLAv4i32_indexed; 5228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5229 FMAInstKind::Indexed, &NewVR); 5230 } else { 5231 Opc = AArch64::FMLAv4f32; 5232 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5233 FMAInstKind::Accumulator, &NewVR); 5234 } 5235 break; 5236 } 5237 case MachineCombinerPattern::FMLSv2f64_OP1: 5238 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 5239 RC = &AArch64::FPR128RegClass; 5240 Register NewVR = MRI.createVirtualRegister(RC); 5241 MachineInstrBuilder MIB1 = 5242 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) 5243 .add(Root.getOperand(2)); 5244 InsInstrs.push_back(MIB1); 5245 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5246 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 5247 Opc = AArch64::FMLAv2i64_indexed; 5248 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5249 FMAInstKind::Indexed, &NewVR); 5250 } else { 5251 Opc = AArch64::FMLAv2f64; 5252 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5253 FMAInstKind::Accumulator, &NewVR); 5254 } 5255 break; 5256 } 5257 } // end switch (Pattern) 5258 // Record MUL and ADD/SUB for deletion 5259 DelInstrs.push_back(MUL); 5260 DelInstrs.push_back(&Root); 5261 } 5262 5263 /// Replace csincr-branch sequence by simple conditional branch 5264 /// 5265 /// Examples: 5266 /// 1. \code 5267 /// csinc w9, wzr, wzr, <condition code> 5268 /// tbnz w9, #0, 0x44 5269 /// \endcode 5270 /// to 5271 /// \code 5272 /// b.<inverted condition code> 5273 /// \endcode 5274 /// 5275 /// 2. 
/// \code
///    csinc  w9, wzr, wzr, <condition code>
///    tbz   w9, #0, 0x44
/// \endcode
/// to
/// \code
///    b.<condition code>
/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
///
/// Examples:
/// \code
///    and  w8, w8, #0x400
///    cbnz w8, L1
/// \endcode
/// to
/// \code
///    tbnz w8, #10, L1
/// \endcode
///
/// \param  MI Conditional Branch. Must be one of Bcc, CB(N)Z or TB(N)Z; any
///            other opcode hits llvm_unreachable.
/// \return True when the simple conditional branch is generated. In that case
///         the replacement branch has been inserted before \p MI and \p MI
///         itself has been erased from its parent block.
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  // Classify the branch:
  //  - IsNegativeBranch: branch taken on non-zero (CBNZ/TBNZ).
  //  - IsTestAndBranch:  TB(N)Z rather than CB(N)Z.
  //  - TargetBBInMI:     index of the operand of MI holding the target block.
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    // Already a plain conditional branch; nothing to rewrite.
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  // (For TB(N)Z, operand 1 is the bit number being tested.)
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruciton\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  // Only a virtual register operand has a definition we can look up.
  Register VReg = MI.getOperand(0).getReg();
  if (!Register::isVirtualRegister(VReg))
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  // Every copy source walked through must have exactly one definition and
  // exactly one non-debug use; otherwise give up.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    // This fold only applies to CB(N)Z; the AND must live in the same block
    // as the branch and the branch must be the only non-debug user of VReg.
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    // Decode the AND's logical immediate; exactly one bit must be set so that
    // a single-bit test is equivalent to the compare against zero.
    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    // The new branch tests the AND's source register directly.
    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!Register::isVirtualRegister(NewReg))
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    // MI is a CB(N)Z here (IsTestAndBranch was rejected above), so its target
    // block is operand 1.
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    // Bit index of the single set bit in Mask; selects the W or X form of the
    // test-and-branch opcode.
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    // Insert the new test-and-branch immediately before the old branch.
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // Register lives on to the CBZ now.
    // (i.e. the AND is no longer the last reader of NewReg, because the new
    // branch inserted above reads it too, so drop the AND's kill flag.)
    MO.setIsKill(false);

    // For immediate smaller than 32, we need to use the 32-bit
    // variant (W) in all cases. Indeed the 64-bit variant does not
    // allow to encode them.
    // Therefore, if the input register is 64-bit, we need to take the
    // 32-bit sub-part.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    // Only handle CSINC whose two source operands are both the zero register
    // (both WZR or both XZR).
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    // Give up if the CSINC carries an NZCV def operand matching this query.
    // NOTE(review): the `true` argument is presumably the "isDead" filter of
    // MachineInstr::findRegisterDefOperandIdx -- confirm against its
    // declaration.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
      return false;

    // Operand 3 of the CSINC is its condition code.
    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
      return false;
    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    // A branch-on-nonzero (CBNZ/TBNZ) becomes b.<inverted condition code>;
    // a branch-on-zero keeps the CSINC's condition code as is.
    if (IsNegativeBranch)
      CC = AArch64CC::getInvertedCondCode(CC);
    BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
    MI.eraseFromParent();
    return true;
  }
  }
}

/// Split the operand target flags \p TF into a pair: first the bits selected
/// by AArch64II::MO_FRAGMENT, second all remaining bits.
std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  const unsigned Mask = AArch64II::MO_FRAGMENT;
  return std::make_pair(TF & Mask, TF & ~Mask);
}

/// Return the static table that pairs each "direct" operand target flag value
/// with its textual name.
/// NOTE(review): presumably consumed by MIR (de)serialization -- confirm
/// against the TargetInstrInfo hook this overrides.
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
      {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
      {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
      {MO_HI12, "aarch64-hi12"}};
  return makeArrayRef(TargetFlags);
}

/// Return the static table that pairs each "bitmask" operand target flag with
/// its textual name.
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_COFFSTUB, "aarch64-coffstub"},
      {MO_GOT, "aarch64-got"},
      {MO_NC, "aarch64-nc"},
      {MO_S, "aarch64-s"},
      {MO_TLS, "aarch64-tls"},
      {MO_DLLIMPORT, "aarch64-dllimport"},
      {MO_PREL, "aarch64-prel"},
      {MO_TAGGED, "aarch64-tagged"}};
  return makeArrayRef(TargetFlags);
}

/// Return the static table that pairs each target-specific
/// MachineMemOperand flag (MOSuppressPair, MOStridedAccess) with its textual
/// name.
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
      {{MOSuppressPair, "aarch64-suppress-pair"},
       {MOStridedAccess, "aarch64-strided-access"}};
  return makeArrayRef(TargetFlags);
}

/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call.
The outlined function is 5530 /// called with a BL instruction, and the outlined function tail-calls the 5531 /// original call destination. 5532 /// 5533 /// That is, 5534 /// 5535 /// I1 OUTLINED_FUNCTION: 5536 /// I2 --> BL OUTLINED_FUNCTION I1 5537 /// BL f I2 5538 /// B f 5539 /// * Call construction overhead: 1 (BL) 5540 /// * Frame construction overhead: 0 5541 /// * Requires stack fixups? No 5542 /// 5543 /// \p MachineOutlinerRegSave implies that the function should be called with a 5544 /// save and restore of LR to an available register. This allows us to avoid 5545 /// stack fixups. Note that this outlining variant is compatible with the 5546 /// NoLRSave case. 5547 /// 5548 /// That is, 5549 /// 5550 /// I1 Save LR OUTLINED_FUNCTION: 5551 /// I2 --> BL OUTLINED_FUNCTION I1 5552 /// I3 Restore LR I2 5553 /// I3 5554 /// RET 5555 /// 5556 /// * Call construction overhead: 3 (save + BL + restore) 5557 /// * Frame construction overhead: 1 (ret) 5558 /// * Requires stack fixups? No 5559 enum MachineOutlinerClass { 5560 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 5561 MachineOutlinerTailCall, /// Only emit a branch. 5562 MachineOutlinerNoLRSave, /// Emit a call and return. 5563 MachineOutlinerThunk, /// Emit a call and tail-call. 5564 MachineOutlinerRegSave /// Same as default, but save to a register. 5565 }; 5566 5567 enum MachineOutlinerMBBFlags { 5568 LRUnavailableSomewhere = 0x2, 5569 HasCalls = 0x4, 5570 UnsafeRegsDead = 0x8 5571 }; 5572 5573 unsigned 5574 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { 5575 assert(C.LRUWasSet && "LRU wasn't set?"); 5576 MachineFunction *MF = C.getMF(); 5577 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 5578 MF->getSubtarget().getRegisterInfo()); 5579 5580 // Check if there is an available register across the sequence that we can 5581 // use. 
5582 for (unsigned Reg : AArch64::GPR64RegClass) { 5583 if (!ARI->isReservedReg(*MF, Reg) && 5584 Reg != AArch64::LR && // LR is not reserved, but don't use it. 5585 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 5586 Reg != AArch64::X17 && // Ditto for X17. 5587 C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) 5588 return Reg; 5589 } 5590 5591 // No suitable register. Return 0. 5592 return 0u; 5593 } 5594 5595 static bool 5596 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 5597 const outliner::Candidate &b) { 5598 const Function &Fa = a.getMF()->getFunction(); 5599 const Function &Fb = b.getMF()->getFunction(); 5600 5601 // If none of the functions have the "sign-return-address" attribute their 5602 // signing behaviour is equal 5603 if (!Fa.hasFnAttribute("sign-return-address") && 5604 !Fb.hasFnAttribute("sign-return-address")) { 5605 return true; 5606 } 5607 5608 // If both functions have the "sign-return-address" attribute their signing 5609 // behaviour is equal, if the values of the attributes are equal 5610 if (Fa.hasFnAttribute("sign-return-address") && 5611 Fb.hasFnAttribute("sign-return-address")) { 5612 StringRef ScopeA = 5613 Fa.getFnAttribute("sign-return-address").getValueAsString(); 5614 StringRef ScopeB = 5615 Fb.getFnAttribute("sign-return-address").getValueAsString(); 5616 return ScopeA.equals(ScopeB); 5617 } 5618 5619 // If function B doesn't have the "sign-return-address" attribute but A does, 5620 // the functions' signing behaviour is equal if A's value for 5621 // "sign-return-address" is "none" and vice versa. 
5622 if (Fa.hasFnAttribute("sign-return-address")) { 5623 StringRef ScopeA = 5624 Fa.getFnAttribute("sign-return-address").getValueAsString(); 5625 return ScopeA.equals("none"); 5626 } 5627 5628 if (Fb.hasFnAttribute("sign-return-address")) { 5629 StringRef ScopeB = 5630 Fb.getFnAttribute("sign-return-address").getValueAsString(); 5631 return ScopeB.equals("none"); 5632 } 5633 5634 llvm_unreachable("Unkown combination of sign-return-address attributes"); 5635 } 5636 5637 static bool 5638 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 5639 const outliner::Candidate &b) { 5640 const Function &Fa = a.getMF()->getFunction(); 5641 const Function &Fb = b.getMF()->getFunction(); 5642 5643 // If none of the functions have the "sign-return-address-key" attribute 5644 // their keys are equal 5645 if (!Fa.hasFnAttribute("sign-return-address-key") && 5646 !Fb.hasFnAttribute("sign-return-address-key")) { 5647 return true; 5648 } 5649 5650 // If both functions have the "sign-return-address-key" attribute their 5651 // keys are equal if the values of "sign-return-address-key" are equal 5652 if (Fa.hasFnAttribute("sign-return-address-key") && 5653 Fb.hasFnAttribute("sign-return-address-key")) { 5654 StringRef KeyA = 5655 Fa.getFnAttribute("sign-return-address-key").getValueAsString(); 5656 StringRef KeyB = 5657 Fb.getFnAttribute("sign-return-address-key").getValueAsString(); 5658 return KeyA.equals(KeyB); 5659 } 5660 5661 // If B doesn't have the "sign-return-address-key" attribute, both keys are 5662 // equal, if function a has the default key (a_key) 5663 if (Fa.hasFnAttribute("sign-return-address-key")) { 5664 StringRef KeyA = 5665 Fa.getFnAttribute("sign-return-address-key").getValueAsString(); 5666 return KeyA.equals_lower("a_key"); 5667 } 5668 5669 if (Fb.hasFnAttribute("sign-return-address-key")) { 5670 StringRef KeyB = 5671 Fb.getFnAttribute("sign-return-address-key").getValueAsString(); 5672 return KeyB.equals_lower("a_key"); 5673 } 5674 5675 
llvm_unreachable("Unkown combination of sign-return-address-key attributes"); 5676 } 5677 5678 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 5679 const outliner::Candidate &b) { 5680 const AArch64Subtarget &SubtargetA = 5681 a.getMF()->getSubtarget<AArch64Subtarget>(); 5682 const AArch64Subtarget &SubtargetB = 5683 b.getMF()->getSubtarget<AArch64Subtarget>(); 5684 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 5685 } 5686 5687 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( 5688 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 5689 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 5690 unsigned SequenceSize = 5691 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 5692 [this](unsigned Sum, const MachineInstr &MI) { 5693 return Sum + getInstSizeInBytes(MI); 5694 }); 5695 unsigned NumBytesToCreateFrame = 0; 5696 5697 // We only allow outlining for functions having exactly matching return 5698 // address signing attributes, i.e., all share the same value for the 5699 // attribute "sign-return-address" and all share the same type of key they 5700 // are signed with. 5701 // Additionally we require all functions to simultaniously either support 5702 // v8.3a features or not. Otherwise an outlined function could get signed 5703 // using dedicated v8.3 instructions and a call from a function that doesn't 5704 // support v8.3 instructions would therefore be invalid. 5705 if (std::adjacent_find( 5706 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 5707 [](const outliner::Candidate &a, const outliner::Candidate &b) { 5708 // Return true if a and b are non-equal w.r.t. 
return address 5709 // signing or support of v8.3a features 5710 if (outliningCandidatesSigningScopeConsensus(a, b) && 5711 outliningCandidatesSigningKeyConsensus(a, b) && 5712 outliningCandidatesV8_3OpsConsensus(a, b)) { 5713 return false; 5714 } 5715 return true; 5716 }) != RepeatedSequenceLocs.end()) { 5717 return outliner::OutlinedFunction(); 5718 } 5719 5720 // Since at this point all candidates agree on their return address signing 5721 // picking just one is fine. If the candidate functions potentially sign their 5722 // return addresses, the outlined function should do the same. Note that in 5723 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 5724 // not certainly true that the outlined function will have to sign its return 5725 // address but this decision is made later, when the decision to outline 5726 // has already been made. 5727 // The same holds for the number of additional instructions we need: On 5728 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 5729 // necessary. However, at this point we don't know if the outlined function 5730 // will have a RET instruction so we assume the worst. 5731 const Function &FCF = FirstCand.getMF()->getFunction(); 5732 const TargetRegisterInfo &TRI = getRegisterInfo(); 5733 if (FCF.hasFnAttribute("sign-return-address")) { 5734 // One PAC and one AUT instructions 5735 NumBytesToCreateFrame += 8; 5736 5737 // We have to check if sp modifying instructions would get outlined. 
5738 // If so we only allow outlining if sp is unchanged overall, so matching 5739 // sub and add instructions are okay to outline, all other sp modifications 5740 // are not 5741 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 5742 int SPValue = 0; 5743 MachineBasicBlock::iterator MBBI = C.front(); 5744 for (;;) { 5745 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 5746 switch (MBBI->getOpcode()) { 5747 case AArch64::ADDXri: 5748 case AArch64::ADDWri: 5749 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 5750 assert(MBBI->getOperand(2).isImm() && 5751 "Expected operand to be immediate"); 5752 assert(MBBI->getOperand(1).isReg() && 5753 "Expected operand to be a register"); 5754 // Check if the add just increments sp. If so, we search for 5755 // matching sub instructions that decrement sp. If not, the 5756 // modification is illegal 5757 if (MBBI->getOperand(1).getReg() == AArch64::SP) 5758 SPValue += MBBI->getOperand(2).getImm(); 5759 else 5760 return true; 5761 break; 5762 case AArch64::SUBXri: 5763 case AArch64::SUBWri: 5764 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 5765 assert(MBBI->getOperand(2).isImm() && 5766 "Expected operand to be immediate"); 5767 assert(MBBI->getOperand(1).isReg() && 5768 "Expected operand to be a register"); 5769 // Check if the sub just decrements sp. If so, we search for 5770 // matching add instructions that increment sp. 
If not, the 5771 // modification is illegal 5772 if (MBBI->getOperand(1).getReg() == AArch64::SP) 5773 SPValue -= MBBI->getOperand(2).getImm(); 5774 else 5775 return true; 5776 break; 5777 default: 5778 return true; 5779 } 5780 } 5781 if (MBBI == C.back()) 5782 break; 5783 ++MBBI; 5784 } 5785 if (SPValue) 5786 return true; 5787 return false; 5788 }; 5789 // Remove candidates with illegal stack modifying instructions 5790 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), 5791 RepeatedSequenceLocs.end(), 5792 hasIllegalSPModification), 5793 RepeatedSequenceLocs.end()); 5794 5795 // If the sequence doesn't have enough candidates left, then we're done. 5796 if (RepeatedSequenceLocs.size() < 2) 5797 return outliner::OutlinedFunction(); 5798 } 5799 5800 // Properties about candidate MBBs that hold for all of them. 5801 unsigned FlagsSetInAll = 0xF; 5802 5803 // Compute liveness information for each candidate, and set FlagsSetInAll. 5804 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 5805 [&FlagsSetInAll](outliner::Candidate &C) { 5806 FlagsSetInAll &= C.Flags; 5807 }); 5808 5809 // According to the AArch64 Procedure Call Standard, the following are 5810 // undefined on entry/exit from a function call: 5811 // 5812 // * Registers x16, x17, (and thus w16, w17) 5813 // * Condition codes (and thus the NZCV register) 5814 // 5815 // Because if this, we can't outline any sequence of instructions where 5816 // one 5817 // of these registers is live into/across it. Thus, we need to delete 5818 // those 5819 // candidates. 5820 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { 5821 // If the unsafe registers in this block are all dead, then we don't need 5822 // to compute liveness here. 
5823 if (C.Flags & UnsafeRegsDead) 5824 return false; 5825 C.initLRU(TRI); 5826 LiveRegUnits LRU = C.LRU; 5827 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || 5828 !LRU.available(AArch64::NZCV)); 5829 }; 5830 5831 // Are there any candidates where those registers are live? 5832 if (!(FlagsSetInAll & UnsafeRegsDead)) { 5833 // Erase every candidate that violates the restrictions above. (It could be 5834 // true that we have viable candidates, so it's not worth bailing out in 5835 // the case that, say, 1 out of 20 candidates violate the restructions.) 5836 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), 5837 RepeatedSequenceLocs.end(), 5838 CantGuaranteeValueAcrossCall), 5839 RepeatedSequenceLocs.end()); 5840 5841 // If the sequence doesn't have enough candidates left, then we're done. 5842 if (RepeatedSequenceLocs.size() < 2) 5843 return outliner::OutlinedFunction(); 5844 } 5845 5846 // At this point, we have only "safe" candidates to outline. Figure out 5847 // frame + call instruction information. 5848 5849 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); 5850 5851 // Helper lambda which sets call information for every candidate. 5852 auto SetCandidateCallInfo = 5853 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 5854 for (outliner::Candidate &C : RepeatedSequenceLocs) 5855 C.setCallInfo(CallID, NumBytesForCall); 5856 }; 5857 5858 unsigned FrameID = MachineOutlinerDefault; 5859 NumBytesToCreateFrame += 4; 5860 5861 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 5862 return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); 5863 }); 5864 5865 // Returns true if an instructions is safe to fix up, false otherwise. 
5866 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 5867 if (MI.isCall()) 5868 return true; 5869 5870 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 5871 !MI.readsRegister(AArch64::SP, &TRI)) 5872 return true; 5873 5874 // Any modification of SP will break our code to save/restore LR. 5875 // FIXME: We could handle some instructions which add a constant 5876 // offset to SP, with a bit more work. 5877 if (MI.modifiesRegister(AArch64::SP, &TRI)) 5878 return false; 5879 5880 // At this point, we have a stack instruction that we might need to 5881 // fix up. We'll handle it if it's a load or store. 5882 if (MI.mayLoadOrStore()) { 5883 const MachineOperand *Base; // Filled with the base operand of MI. 5884 int64_t Offset; // Filled with the offset of MI. 5885 bool OffsetIsScalable; 5886 5887 // Does it allow us to offset the base operand and is the base the 5888 // register SP? 5889 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 5890 !Base->isReg() || Base->getReg() != AArch64::SP) 5891 return false; 5892 5893 // Fixe-up code below assumes bytes. 5894 if (OffsetIsScalable) 5895 return false; 5896 5897 // Find the minimum/maximum offset for this instruction and check 5898 // if fixing it up would be in range. 5899 int64_t MinOffset, 5900 MaxOffset; // Unscaled offsets for the instruction. 5901 TypeSize Scale(0U, false); // The scale to multiply the offsets by. 5902 unsigned DummyWidth; 5903 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 5904 5905 Offset += 16; // Update the offset to what it would be if we outlined. 5906 if (Offset < MinOffset * (int64_t)Scale.getFixedSize() || 5907 Offset > MaxOffset * (int64_t)Scale.getFixedSize()) 5908 return false; 5909 5910 // It's in range, so we can outline it. 5911 return true; 5912 } 5913 5914 // FIXME: Add handling for instructions like "add x0, sp, #8". 5915 5916 // We can't fix it up, so don't outline it. 
5917 return false; 5918 }; 5919 5920 // True if it's possible to fix up each stack instruction in this sequence. 5921 // Important for frames/call variants that modify the stack. 5922 bool AllStackInstrsSafe = std::all_of( 5923 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 5924 5925 // If the last instruction in any candidate is a terminator, then we should 5926 // tail call all of the candidates. 5927 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 5928 FrameID = MachineOutlinerTailCall; 5929 NumBytesToCreateFrame = 0; 5930 SetCandidateCallInfo(MachineOutlinerTailCall, 4); 5931 } 5932 5933 else if (LastInstrOpcode == AArch64::BL || 5934 (LastInstrOpcode == AArch64::BLR && !HasBTI)) { 5935 // FIXME: Do we need to check if the code after this uses the value of LR? 5936 FrameID = MachineOutlinerThunk; 5937 NumBytesToCreateFrame = 0; 5938 SetCandidateCallInfo(MachineOutlinerThunk, 4); 5939 } 5940 5941 else { 5942 // We need to decide how to emit calls + frames. We can always emit the same 5943 // frame if we don't need to save to the stack. If we have to save to the 5944 // stack, then we need a different frame. 5945 unsigned NumBytesNoStackCalls = 0; 5946 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 5947 5948 // Check if we have to save LR. 5949 for (outliner::Candidate &C : RepeatedSequenceLocs) { 5950 C.initLRU(TRI); 5951 5952 // If we have a noreturn caller, then we're going to be conservative and 5953 // say that we have to save LR. If we don't have a ret at the end of the 5954 // block, then we can't reason about liveness accurately. 5955 // 5956 // FIXME: We can probably do better than always disabling this in 5957 // noreturn functions by fixing up the liveness info. 5958 bool IsNoReturn = 5959 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 5960 5961 // Is LR available? If so, we don't need a save. 
5962 if (C.LRU.available(AArch64::LR) && !IsNoReturn) { 5963 NumBytesNoStackCalls += 4; 5964 C.setCallInfo(MachineOutlinerNoLRSave, 4); 5965 CandidatesWithoutStackFixups.push_back(C); 5966 } 5967 5968 // Is an unused register available? If so, we won't modify the stack, so 5969 // we can outline with the same frame type as those that don't save LR. 5970 else if (findRegisterToSaveLRTo(C)) { 5971 NumBytesNoStackCalls += 12; 5972 C.setCallInfo(MachineOutlinerRegSave, 12); 5973 CandidatesWithoutStackFixups.push_back(C); 5974 } 5975 5976 // Is SP used in the sequence at all? If not, we don't have to modify 5977 // the stack, so we are guaranteed to get the same frame. 5978 else if (C.UsedInSequence.available(AArch64::SP)) { 5979 NumBytesNoStackCalls += 12; 5980 C.setCallInfo(MachineOutlinerDefault, 12); 5981 CandidatesWithoutStackFixups.push_back(C); 5982 } 5983 5984 // If we outline this, we need to modify the stack. Pretend we don't 5985 // outline this by saving all of its bytes. 5986 else { 5987 NumBytesNoStackCalls += SequenceSize; 5988 } 5989 } 5990 5991 // If there are no places where we have to save LR, then note that we 5992 // don't have to update the stack. Otherwise, give every candidate the 5993 // default call type, as long as it's safe to do so. 5994 if (!AllStackInstrsSafe || 5995 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 5996 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 5997 FrameID = MachineOutlinerNoLRSave; 5998 } else { 5999 SetCandidateCallInfo(MachineOutlinerDefault, 12); 6000 } 6001 6002 // If we dropped all of the candidates, bail out here. 6003 if (RepeatedSequenceLocs.size() < 2) { 6004 RepeatedSequenceLocs.clear(); 6005 return outliner::OutlinedFunction(); 6006 } 6007 } 6008 6009 // Does every candidate's MBB contain a call? If so, then we might have a call 6010 // in the range. 6011 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6012 // Check if the range contains a call. 
These require a save + restore of the 6013 // link register. 6014 bool ModStackToSaveLR = false; 6015 if (std::any_of(FirstCand.front(), FirstCand.back(), 6016 [](const MachineInstr &MI) { return MI.isCall(); })) 6017 ModStackToSaveLR = true; 6018 6019 // Handle the last instruction separately. If this is a tail call, then the 6020 // last instruction is a call. We don't want to save + restore in this case. 6021 // However, it could be possible that the last instruction is a call without 6022 // it being valid to tail call this sequence. We should consider this as 6023 // well. 6024 else if (FrameID != MachineOutlinerThunk && 6025 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 6026 ModStackToSaveLR = true; 6027 6028 if (ModStackToSaveLR) { 6029 // We can't fix up the stack. Bail out. 6030 if (!AllStackInstrsSafe) { 6031 RepeatedSequenceLocs.clear(); 6032 return outliner::OutlinedFunction(); 6033 } 6034 6035 // Save + restore LR. 6036 NumBytesToCreateFrame += 8; 6037 } 6038 } 6039 6040 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 6041 NumBytesToCreateFrame, FrameID); 6042 } 6043 6044 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 6045 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 6046 const Function &F = MF.getFunction(); 6047 6048 // Can F be deduplicated by the linker? If it can, don't outline from it. 6049 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 6050 return false; 6051 6052 // Don't outline from functions with section markings; the program could 6053 // expect that all the code is in the named section. 6054 // FIXME: Allow outlining from multiple functions with the same section 6055 // marking. 6056 if (F.hasSection()) 6057 return false; 6058 6059 // Outlining from functions with redzones is unsafe since the outliner may 6060 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 6061 // outline from it. 
6062 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 6063 if (!AFI || AFI->hasRedZone().getValueOr(true)) 6064 return false; 6065 6066 // It's safe to outline from MF. 6067 return true; 6068 } 6069 6070 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, 6071 unsigned &Flags) const { 6072 // Check if LR is available through all of the MBB. If it's not, then set 6073 // a flag. 6074 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 6075 "Suitable Machine Function for outlining must track liveness"); 6076 LiveRegUnits LRU(getRegisterInfo()); 6077 6078 std::for_each(MBB.rbegin(), MBB.rend(), 6079 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); 6080 6081 // Check if each of the unsafe registers are available... 6082 bool W16AvailableInBlock = LRU.available(AArch64::W16); 6083 bool W17AvailableInBlock = LRU.available(AArch64::W17); 6084 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); 6085 6086 // If all of these are dead (and not live out), we know we don't have to check 6087 // them later. 6088 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) 6089 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; 6090 6091 // Now, add the live outs to the set. 6092 LRU.addLiveOuts(MBB); 6093 6094 // If any of these registers is available in the MBB, but also a live out of 6095 // the block, then we know outlining is unsafe. 6096 if (W16AvailableInBlock && !LRU.available(AArch64::W16)) 6097 return false; 6098 if (W17AvailableInBlock && !LRU.available(AArch64::W17)) 6099 return false; 6100 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) 6101 return false; 6102 6103 // Check if there's a call inside this MachineBasicBlock. If there is, then 6104 // set a flag. 6105 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) 6106 Flags |= MachineOutlinerMBBFlags::HasCalls; 6107 6108 MachineFunction *MF = MBB.getParent(); 6109 6110 // In the event that we outline, we may have to save LR. 
If there is an 6111 // available register in the MBB, then we'll always save LR there. Check if 6112 // this is true. 6113 bool CanSaveLR = false; 6114 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6115 MF->getSubtarget().getRegisterInfo()); 6116 6117 // Check if there is an available register across the sequence that we can 6118 // use. 6119 for (unsigned Reg : AArch64::GPR64RegClass) { 6120 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && 6121 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { 6122 CanSaveLR = true; 6123 break; 6124 } 6125 } 6126 6127 // Check if we have a register we can save LR to, and if LR was used 6128 // somewhere. If both of those things are true, then we need to evaluate the 6129 // safety of outlining stack instructions later. 6130 if (!CanSaveLR && !LRU.available(AArch64::LR)) 6131 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 6132 6133 return true; 6134 } 6135 6136 outliner::InstrType 6137 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, 6138 unsigned Flags) const { 6139 MachineInstr &MI = *MIT; 6140 MachineBasicBlock *MBB = MI.getParent(); 6141 MachineFunction *MF = MBB->getParent(); 6142 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 6143 6144 // Don't outline anything used for return address signing. The outlined 6145 // function will get signed later if needed 6146 switch (MI.getOpcode()) { 6147 case AArch64::PACIASP: 6148 case AArch64::PACIBSP: 6149 case AArch64::AUTIASP: 6150 case AArch64::AUTIBSP: 6151 case AArch64::RETAA: 6152 case AArch64::RETAB: 6153 case AArch64::EMITBKEY: 6154 return outliner::InstrType::Illegal; 6155 } 6156 6157 // Don't outline LOHs. 6158 if (FuncInfo->getLOHRelated().count(&MI)) 6159 return outliner::InstrType::Illegal; 6160 6161 // We can only outline these if we will tail call the outlined function, or 6162 // fix up the CFI offsets. 
For the sake of safety, don't outline CFI 6163 // instructions. 6164 // 6165 // FIXME: If the proper fixups are implemented, this should be possible. 6166 if (MI.isCFIInstruction()) 6167 return outliner::InstrType::Illegal; 6168 6169 // Don't allow debug values to impact outlining type. 6170 if (MI.isDebugInstr() || MI.isIndirectDebugValue()) 6171 return outliner::InstrType::Invisible; 6172 6173 // At this point, KILL instructions don't really tell us much so we can go 6174 // ahead and skip over them. 6175 if (MI.isKill()) 6176 return outliner::InstrType::Invisible; 6177 6178 // Is this a terminator for a basic block? 6179 if (MI.isTerminator()) { 6180 6181 // Is this the end of a function? 6182 if (MI.getParent()->succ_empty()) 6183 return outliner::InstrType::Legal; 6184 6185 // It's not, so don't outline it. 6186 return outliner::InstrType::Illegal; 6187 } 6188 6189 // Make sure none of the operands are un-outlinable. 6190 for (const MachineOperand &MOP : MI.operands()) { 6191 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || 6192 MOP.isTargetIndex()) 6193 return outliner::InstrType::Illegal; 6194 6195 // If it uses LR or W30 explicitly, then don't touch it. 6196 if (MOP.isReg() && !MOP.isImplicit() && 6197 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) 6198 return outliner::InstrType::Illegal; 6199 } 6200 6201 // Special cases for instructions that can always be outlined, but will fail 6202 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always 6203 // be outlined because they don't require a *specific* value to be in LR. 6204 if (MI.getOpcode() == AArch64::ADRP) 6205 return outliner::InstrType::Legal; 6206 6207 // If MI is a call we might be able to outline it. We don't want to outline 6208 // any calls that rely on the position of items on the stack. When we outline 6209 // something containing a call, we have to emit a save and restore of LR in 6210 // the outlined function. 
Currently, this always happens by saving LR to the 6211 // stack. Thus, if we outline, say, half the parameters for a function call 6212 // plus the call, then we'll break the callee's expectations for the layout 6213 // of the stack. 6214 // 6215 // FIXME: Allow calls to functions which construct a stack frame, as long 6216 // as they don't access arguments on the stack. 6217 // FIXME: Figure out some way to analyze functions defined in other modules. 6218 // We should be able to compute the memory usage based on the IR calling 6219 // convention, even if we can't see the definition. 6220 if (MI.isCall()) { 6221 // Get the function associated with the call. Look at each operand and find 6222 // the one that represents the callee and get its name. 6223 const Function *Callee = nullptr; 6224 for (const MachineOperand &MOP : MI.operands()) { 6225 if (MOP.isGlobal()) { 6226 Callee = dyn_cast<Function>(MOP.getGlobal()); 6227 break; 6228 } 6229 } 6230 6231 // Never outline calls to mcount. There isn't any rule that would require 6232 // this, but the Linux kernel's "ftrace" feature depends on it. 6233 if (Callee && Callee->getName() == "\01_mcount") 6234 return outliner::InstrType::Illegal; 6235 6236 // If we don't know anything about the callee, assume it depends on the 6237 // stack layout of the caller. In that case, it's only legal to outline 6238 // as a tail-call. Whitelist the call instructions we know about so we 6239 // don't get unexpected results with call pseudo-instructions. 6240 auto UnknownCallOutlineType = outliner::InstrType::Illegal; 6241 if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL) 6242 UnknownCallOutlineType = outliner::InstrType::LegalTerminator; 6243 6244 if (!Callee) 6245 return UnknownCallOutlineType; 6246 6247 // We have a function we have information about. Check it if it's something 6248 // can safely outline. 
6249 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); 6250 6251 // We don't know what's going on with the callee at all. Don't touch it. 6252 if (!CalleeMF) 6253 return UnknownCallOutlineType; 6254 6255 // Check if we know anything about the callee saves on the function. If we 6256 // don't, then don't touch it, since that implies that we haven't 6257 // computed anything about its stack frame yet. 6258 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 6259 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 6260 MFI.getNumObjects() > 0) 6261 return UnknownCallOutlineType; 6262 6263 // At this point, we can say that CalleeMF ought to not pass anything on the 6264 // stack. Therefore, we can outline it. 6265 return outliner::InstrType::Legal; 6266 } 6267 6268 // Don't outline positions. 6269 if (MI.isPosition()) 6270 return outliner::InstrType::Illegal; 6271 6272 // Don't touch the link register or W30. 6273 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 6274 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 6275 return outliner::InstrType::Illegal; 6276 6277 // Don't outline BTI instructions, because that will prevent the outlining 6278 // site from being indirectly callable. 6279 if (MI.getOpcode() == AArch64::HINT) { 6280 int64_t Imm = MI.getOperand(0).getImm(); 6281 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 6282 return outliner::InstrType::Illegal; 6283 } 6284 6285 return outliner::InstrType::Legal; 6286 } 6287 6288 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 6289 for (MachineInstr &MI : MBB) { 6290 const MachineOperand *Base; 6291 unsigned Width; 6292 int64_t Offset; 6293 bool OffsetIsScalable; 6294 6295 // Is this a load or store with an immediate offset with SP as the base? 
6296 if (!MI.mayLoadOrStore() || 6297 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 6298 &RI) || 6299 (Base->isReg() && Base->getReg() != AArch64::SP)) 6300 continue; 6301 6302 // It is, so we have to fix it up. 6303 TypeSize Scale(0U, false); 6304 int64_t Dummy1, Dummy2; 6305 6306 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 6307 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 6308 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 6309 assert(Scale != 0 && "Unexpected opcode!"); 6310 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 6311 6312 // We've pushed the return address to the stack, so add 16 to the offset. 6313 // This is safe, since we already checked if it would overflow when we 6314 // checked if this instruction was legal to outline. 6315 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize(); 6316 StackOffsetOperand.setImm(NewImm); 6317 } 6318 } 6319 6320 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 6321 bool ShouldSignReturnAddr, 6322 bool ShouldSignReturnAddrWithAKey) { 6323 if (ShouldSignReturnAddr) { 6324 MachineBasicBlock::iterator MBBPAC = MBB.begin(); 6325 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); 6326 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 6327 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 6328 DebugLoc DL; 6329 6330 if (MBBAUT != MBB.end()) 6331 DL = MBBAUT->getDebugLoc(); 6332 6333 // At the very beginning of the basic block we insert the following 6334 // depending on the key type 6335 // 6336 // a_key: b_key: 6337 // PACIASP EMITBKEY 6338 // CFI_INSTRUCTION PACIBSP 6339 // CFI_INSTRUCTION 6340 if (ShouldSignReturnAddrWithAKey) { 6341 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP)) 6342 .setMIFlag(MachineInstr::FrameSetup); 6343 } else { 6344 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) 6345 
.setMIFlag(MachineInstr::FrameSetup); 6346 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP)) 6347 .setMIFlag(MachineInstr::FrameSetup); 6348 } 6349 unsigned CFIIndex = 6350 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); 6351 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) 6352 .addCFIIndex(CFIIndex) 6353 .setMIFlags(MachineInstr::FrameSetup); 6354 6355 // If v8.3a features are available we can replace a RET instruction by 6356 // RETAA or RETAB and omit the AUT instructions 6357 if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() && 6358 MBBAUT->getOpcode() == AArch64::RET) { 6359 BuildMI(MBB, MBBAUT, DL, 6360 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA 6361 : AArch64::RETAB)) 6362 .copyImplicitOps(*MBBAUT); 6363 MBB.erase(MBBAUT); 6364 } else { 6365 BuildMI(MBB, MBBAUT, DL, 6366 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP 6367 : AArch64::AUTIBSP)) 6368 .setMIFlag(MachineInstr::FrameDestroy); 6369 } 6370 } 6371 } 6372 6373 void AArch64InstrInfo::buildOutlinedFrame( 6374 MachineBasicBlock &MBB, MachineFunction &MF, 6375 const outliner::OutlinedFunction &OF) const { 6376 // For thunk outlining, rewrite the last instruction from a call to a 6377 // tail-call. 6378 if (OF.FrameConstructionID == MachineOutlinerThunk) { 6379 MachineInstr *Call = &*--MBB.instr_end(); 6380 unsigned TailOpcode; 6381 if (Call->getOpcode() == AArch64::BL) { 6382 TailOpcode = AArch64::TCRETURNdi; 6383 } else { 6384 assert(Call->getOpcode() == AArch64::BLR); 6385 TailOpcode = AArch64::TCRETURNriALL; 6386 } 6387 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 6388 .add(Call->getOperand(0)) 6389 .addImm(0); 6390 MBB.insert(MBB.end(), TC); 6391 Call->eraseFromParent(); 6392 } 6393 6394 bool IsLeafFunction = true; 6395 6396 // Is there a call in the outlined range? 
6397 auto IsNonTailCall = [](const MachineInstr &MI) { 6398 return MI.isCall() && !MI.isReturn(); 6399 }; 6400 6401 if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) { 6402 // Fix up the instructions in the range, since we're going to modify the 6403 // stack. 6404 assert(OF.FrameConstructionID != MachineOutlinerDefault && 6405 "Can only fix up stack references once"); 6406 fixupPostOutline(MBB); 6407 6408 IsLeafFunction = false; 6409 6410 // LR has to be a live in so that we can save it. 6411 MBB.addLiveIn(AArch64::LR); 6412 6413 MachineBasicBlock::iterator It = MBB.begin(); 6414 MachineBasicBlock::iterator Et = MBB.end(); 6415 6416 if (OF.FrameConstructionID == MachineOutlinerTailCall || 6417 OF.FrameConstructionID == MachineOutlinerThunk) 6418 Et = std::prev(MBB.end()); 6419 6420 // Insert a save before the outlined region 6421 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 6422 .addReg(AArch64::SP, RegState::Define) 6423 .addReg(AArch64::LR) 6424 .addReg(AArch64::SP) 6425 .addImm(-16); 6426 It = MBB.insert(It, STRXpre); 6427 6428 const TargetSubtargetInfo &STI = MF.getSubtarget(); 6429 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 6430 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 6431 6432 // Add a CFI saying the stack was moved 16 B down. 6433 int64_t StackPosEntry = 6434 MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16)); 6435 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 6436 .addCFIIndex(StackPosEntry) 6437 .setMIFlags(MachineInstr::FrameSetup); 6438 6439 // Add a CFI saying that the LR that we want to find is now 16 B higher than 6440 // before. 6441 int64_t LRPosEntry = 6442 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16)); 6443 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 6444 .addCFIIndex(LRPosEntry) 6445 .setMIFlags(MachineInstr::FrameSetup); 6446 6447 // Insert a restore before the terminator for the function. 
6448 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 6449 .addReg(AArch64::SP, RegState::Define) 6450 .addReg(AArch64::LR, RegState::Define) 6451 .addReg(AArch64::SP) 6452 .addImm(16); 6453 Et = MBB.insert(Et, LDRXpost); 6454 } 6455 6456 // If a bunch of candidates reach this point they must agree on their return 6457 // address signing. It is therefore enough to just consider the signing 6458 // behaviour of one of them 6459 const Function &CF = OF.Candidates.front().getMF()->getFunction(); 6460 bool ShouldSignReturnAddr = false; 6461 if (CF.hasFnAttribute("sign-return-address")) { 6462 StringRef Scope = 6463 CF.getFnAttribute("sign-return-address").getValueAsString(); 6464 if (Scope.equals("all")) 6465 ShouldSignReturnAddr = true; 6466 else if (Scope.equals("non-leaf") && !IsLeafFunction) 6467 ShouldSignReturnAddr = true; 6468 } 6469 6470 // a_key is the default 6471 bool ShouldSignReturnAddrWithAKey = true; 6472 if (CF.hasFnAttribute("sign-return-address-key")) { 6473 const StringRef Key = 6474 CF.getFnAttribute("sign-return-address-key").getValueAsString(); 6475 // Key can either be a_key or b_key 6476 assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) && 6477 "Return address signing key must be either a_key or b_key"); 6478 ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key"); 6479 } 6480 6481 // If this is a tail call outlined function, then there's already a return. 6482 if (OF.FrameConstructionID == MachineOutlinerTailCall || 6483 OF.FrameConstructionID == MachineOutlinerThunk) { 6484 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 6485 ShouldSignReturnAddrWithAKey); 6486 return; 6487 } 6488 6489 // It's not a tail call, so we have to insert the return ourselves. 
6490 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 6491 .addReg(AArch64::LR, RegState::Undef); 6492 MBB.insert(MBB.end(), ret); 6493 6494 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 6495 ShouldSignReturnAddrWithAKey); 6496 6497 // Did we have to modify the stack by saving the link register? 6498 if (OF.FrameConstructionID != MachineOutlinerDefault) 6499 return; 6500 6501 // We modified the stack. 6502 // Walk over the basic block and fix up all the stack accesses. 6503 fixupPostOutline(MBB); 6504 } 6505 6506 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 6507 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 6508 MachineFunction &MF, const outliner::Candidate &C) const { 6509 6510 // Are we tail calling? 6511 if (C.CallConstructionID == MachineOutlinerTailCall) { 6512 // If yes, then we can just branch to the label. 6513 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 6514 .addGlobalAddress(M.getNamedValue(MF.getName())) 6515 .addImm(0)); 6516 return It; 6517 } 6518 6519 // Are we saving the link register? 6520 if (C.CallConstructionID == MachineOutlinerNoLRSave || 6521 C.CallConstructionID == MachineOutlinerThunk) { 6522 // No, so just insert the call. 6523 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 6524 .addGlobalAddress(M.getNamedValue(MF.getName()))); 6525 return It; 6526 } 6527 6528 // We want to return the spot where we inserted the call. 6529 MachineBasicBlock::iterator CallPt; 6530 6531 // Instructions for saving and restoring LR around the call instruction we're 6532 // going to insert. 6533 MachineInstr *Save; 6534 MachineInstr *Restore; 6535 // Can we save to a register? 6536 if (C.CallConstructionID == MachineOutlinerRegSave) { 6537 // FIXME: This logic should be sunk into a target-specific interface so that 6538 // we don't have to recompute the register. 
6539 unsigned Reg = findRegisterToSaveLRTo(C); 6540 assert(Reg != 0 && "No callee-saved register available?"); 6541 6542 // Save and restore LR from that register. 6543 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 6544 .addReg(AArch64::XZR) 6545 .addReg(AArch64::LR) 6546 .addImm(0); 6547 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 6548 .addReg(AArch64::XZR) 6549 .addReg(Reg) 6550 .addImm(0); 6551 } else { 6552 // We have the default case. Save and restore from SP. 6553 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 6554 .addReg(AArch64::SP, RegState::Define) 6555 .addReg(AArch64::LR) 6556 .addReg(AArch64::SP) 6557 .addImm(-16); 6558 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 6559 .addReg(AArch64::SP, RegState::Define) 6560 .addReg(AArch64::LR, RegState::Define) 6561 .addReg(AArch64::SP) 6562 .addImm(16); 6563 } 6564 6565 It = MBB.insert(It, Save); 6566 It++; 6567 6568 // Insert the call. 6569 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 6570 .addGlobalAddress(M.getNamedValue(MF.getName()))); 6571 CallPt = It; 6572 It++; 6573 6574 It = MBB.insert(It, Restore); 6575 return CallPt; 6576 } 6577 6578 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 6579 MachineFunction &MF) const { 6580 return MF.getFunction().hasMinSize(); 6581 } 6582 6583 Optional<DestSourcePair> 6584 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 6585 6586 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 6587 // and zero immediate operands used as an alias for mov instruction. 
6588 if (MI.getOpcode() == AArch64::ORRWrs && 6589 MI.getOperand(1).getReg() == AArch64::WZR && 6590 MI.getOperand(3).getImm() == 0x0) { 6591 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 6592 } 6593 6594 if (MI.getOpcode() == AArch64::ORRXrs && 6595 MI.getOperand(1).getReg() == AArch64::XZR && 6596 MI.getOperand(3).getImm() == 0x0) { 6597 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 6598 } 6599 6600 return None; 6601 } 6602 6603 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, 6604 Register Reg) const { 6605 int Sign = 1; 6606 int64_t Offset = 0; 6607 6608 // TODO: Handle cases where Reg is a super- or sub-register of the 6609 // destination register. 6610 const MachineOperand &Op0 = MI.getOperand(0); 6611 if (!Op0.isReg() || Reg != Op0.getReg()) 6612 return None; 6613 6614 switch (MI.getOpcode()) { 6615 default: 6616 return None; 6617 case AArch64::SUBWri: 6618 case AArch64::SUBXri: 6619 case AArch64::SUBSWri: 6620 case AArch64::SUBSXri: 6621 Sign *= -1; 6622 LLVM_FALLTHROUGH; 6623 case AArch64::ADDSWri: 6624 case AArch64::ADDSXri: 6625 case AArch64::ADDWri: 6626 case AArch64::ADDXri: { 6627 // TODO: Third operand can be global address (usually some string). 6628 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 6629 !MI.getOperand(2).isImm()) 6630 return None; 6631 Offset = MI.getOperand(2).getImm() * Sign; 6632 int Shift = MI.getOperand(3).getImm(); 6633 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 6634 Offset = Offset << Shift; 6635 } 6636 } 6637 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 6638 } 6639 6640 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 6641 /// the destination register then, if possible, describe the value in terms of 6642 /// the source register. 
6643 static Optional<ParamLoadedValue> 6644 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 6645 const TargetInstrInfo *TII, 6646 const TargetRegisterInfo *TRI) { 6647 auto DestSrc = TII->isCopyInstr(MI); 6648 if (!DestSrc) 6649 return None; 6650 6651 Register DestReg = DestSrc->Destination->getReg(); 6652 Register SrcReg = DestSrc->Source->getReg(); 6653 6654 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 6655 6656 // If the described register is the destination, just return the source. 6657 if (DestReg == DescribedReg) 6658 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 6659 6660 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 6661 if (MI.getOpcode() == AArch64::ORRWrs && 6662 TRI->isSuperRegister(DestReg, DescribedReg)) 6663 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 6664 6665 // We may need to describe the lower part of a ORRXrs move. 6666 if (MI.getOpcode() == AArch64::ORRXrs && 6667 TRI->isSubRegister(DestReg, DescribedReg)) { 6668 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 6669 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 6670 } 6671 6672 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 6673 "Unhandled ORR[XW]rs copy case"); 6674 6675 return None; 6676 } 6677 6678 Optional<ParamLoadedValue> 6679 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 6680 Register Reg) const { 6681 const MachineFunction *MF = MI.getMF(); 6682 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 6683 switch (MI.getOpcode()) { 6684 case AArch64::MOVZWi: 6685 case AArch64::MOVZXi: { 6686 // MOVZWi may be used for producing zero-extended 32-bit immediates in 6687 // 64-bit parameters, so we need to consider super-registers. 
6688 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 6689 return None; 6690 6691 if (!MI.getOperand(1).isImm()) 6692 return None; 6693 int64_t Immediate = MI.getOperand(1).getImm(); 6694 int Shift = MI.getOperand(2).getImm(); 6695 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 6696 nullptr); 6697 } 6698 case AArch64::ORRWrs: 6699 case AArch64::ORRXrs: 6700 return describeORRLoadedValue(MI, Reg, this, TRI); 6701 } 6702 6703 return TargetInstrInfo::describeLoadedValue(MI, Reg); 6704 } 6705 6706 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 6707 return get(Opc).TSFlags & AArch64::ElementSizeMask; 6708 } 6709 6710 #define GET_INSTRINFO_HELPERS 6711 #define GET_INSTRMAP_INFO 6712 #include "AArch64GenInstrInfo.inc" 6713