//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

// Pull in the TableGen-generated instruction info constructor/destructor.
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
// Instantiate the TableGen-generated searchable tables used by this file.
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}


// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  // Initialize the scheduling model from the subtarget's machine model.
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// \returns the number of operands of \p Node, not counting any trailing
/// glue operands at the end of the operand list.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither opcode has this named operand: treat as equal.
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;


  // Exactly one of the opcodes has this operand: not comparable.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // No implicit operands.
    return MI.getNumOperands() == MI.getDesc().getNumOperands();
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of datalesss loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // SMRD offsets must be constants here; operand 1 is the offset.
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

/// \returns true for the ST64 variants of DS read2/write2, whose two
/// offsets step in units of 64 elements rather than 1.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
                                          const MachineOperand *&BaseOp,
                                          int64_t &Offset,
                                          const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();

  if (isDS(LdSt)) {
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
      // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
      // report that here?
      if (!BaseOp || !BaseOp->isReg())
        return false;

      Offset = OffsetImm->getImm();

      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive. We
    // will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        // NOTE(review): /16 rather than /8 — the read2 destination register
        // covers both elements, so halve the per-element size. Confirm against
        // the read2 operand definitions.
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
      if (!BaseOp->isReg())
        return false;

      Offset = EltSize * Offset0;

      return true;
    }

    return false;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset && SOffset->isReg()) {
      // We can only handle this if it's a stack access, as any other resource
      // would require reporting multiple base registers.
      const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (AddrReg && !AddrReg->isFI())
        return false;

      const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      const SIMachineFunctionInfo *MFI
        = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
      if (RSrc->getReg() != MFI->getScratchRSrcReg())
        return false;

      const MachineOperand *OffsetImm =
          getNamedOperand(LdSt, AMDGPU::OpName::offset);
      BaseOp = SOffset;
      Offset = OffsetImm->getImm();
      return true;
    }

    const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    BaseOp = AddrReg;
    Offset = OffsetImm->getImm();
    if (SOffset) // soffset can be an inline immediate.
      Offset += SOffset->getImm();

    if (!BaseOp->isReg())
      return false;

    return true;
  }

  if (isSMRD(LdSt)) {
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    BaseOp = SBaseReg;
    Offset = OffsetImm->getImm();
    if (!BaseOp->isReg())
      return false;

    return true;
  }

  if (isFLAT(LdSt)) {
    const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (VAddr) {
      // Can't analyze 2 offsets.
      if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
        return false;

      BaseOp = VAddr;
    } else {
      // scratch instructions have either vaddr or saddr.
      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    }

    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    if (!BaseOp->isReg())
      return false;
    return true;
  }

  return false;
}

/// \returns true if the two memory ops share a base pointer, either by
/// identical base register operands or by tracing their single memory
/// operands back to the same underlying IR object.
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  const MachineOperand &BaseOp1,
                                  const MachineInstr &MI2,
                                  const MachineOperand &BaseOp2) {
  // Support only base operands with base registers.
  // Note: this could be extended to support FI operands.
  if (!BaseOp1.isReg() || !BaseOp2.isReg())
    return false;

  if (BaseOp1.isIdenticalTo(BaseOp2))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  const MachineFunction &MF = *MI1.getParent()->getParent();
  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
  Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
                                      const MachineOperand &BaseOp2,
                                      unsigned NumLoads) const {
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();

  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
    return false;

  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
    const unsigned MaxGlobalLoadCluster = 6;
    if (NumLoads > MaxGlobalLoadCluster)
      return false;

    // Buffer/flat loads may name the destination vdata or vdst.
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    if (!FirstDst)
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
    if (!SecondDst)
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions.  This is done to help reduce
  // register pressure.  The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();

  const Register Reg = FirstDst->getReg();

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);

  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 16 store batches.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
504 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, 505 int64_t Offset0, int64_t Offset1, 506 unsigned NumLoads) const { 507 assert(Offset1 > Offset0 && 508 "Second offset should be larger than first offset!"); 509 // If we have less than 16 loads in a row, and the offsets are within 64 510 // bytes, then schedule together. 511 512 // A cacheline is 64 bytes (for global memory). 513 return (NumLoads <= 16 && (Offset1 - Offset0) < 64); 514 } 515 516 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 517 MachineBasicBlock::iterator MI, 518 const DebugLoc &DL, MCRegister DestReg, 519 MCRegister SrcReg, bool KillSrc) { 520 MachineFunction *MF = MBB.getParent(); 521 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), 522 "illegal SGPR to VGPR copy", 523 DL, DS_Error); 524 LLVMContext &C = MF->getFunction().getContext(); 525 C.diagnose(IllegalCopy); 526 527 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 528 .addReg(SrcReg, getKillRegState(KillSrc)); 529 } 530 531 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 532 MachineBasicBlock::iterator MI, 533 const DebugLoc &DL, MCRegister DestReg, 534 MCRegister SrcReg, bool KillSrc) const { 535 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 536 537 if (RC == &AMDGPU::VGPR_32RegClass) { 538 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 539 AMDGPU::SReg_32RegClass.contains(SrcReg) || 540 AMDGPU::AGPR_32RegClass.contains(SrcReg)); 541 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? 
542 AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32; 543 BuildMI(MBB, MI, DL, get(Opc), DestReg) 544 .addReg(SrcReg, getKillRegState(KillSrc)); 545 return; 546 } 547 548 if (RC == &AMDGPU::SReg_32_XM0RegClass || 549 RC == &AMDGPU::SReg_32RegClass) { 550 if (SrcReg == AMDGPU::SCC) { 551 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 552 .addImm(1) 553 .addImm(0); 554 return; 555 } 556 557 if (DestReg == AMDGPU::VCC_LO) { 558 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { 559 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) 560 .addReg(SrcReg, getKillRegState(KillSrc)); 561 } else { 562 // FIXME: Hack until VReg_1 removed. 563 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 564 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 565 .addImm(0) 566 .addReg(SrcReg, getKillRegState(KillSrc)); 567 } 568 569 return; 570 } 571 572 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 573 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 574 return; 575 } 576 577 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 578 .addReg(SrcReg, getKillRegState(KillSrc)); 579 return; 580 } 581 582 if (RC == &AMDGPU::SReg_64RegClass) { 583 if (DestReg == AMDGPU::VCC) { 584 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 585 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 586 .addReg(SrcReg, getKillRegState(KillSrc)); 587 } else { 588 // FIXME: Hack until VReg_1 removed. 
589 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 590 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 591 .addImm(0) 592 .addReg(SrcReg, getKillRegState(KillSrc)); 593 } 594 595 return; 596 } 597 598 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 599 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 600 return; 601 } 602 603 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 604 .addReg(SrcReg, getKillRegState(KillSrc)); 605 return; 606 } 607 608 if (DestReg == AMDGPU::SCC) { 609 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 610 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 611 .addReg(SrcReg, getKillRegState(KillSrc)) 612 .addImm(0); 613 return; 614 } 615 616 if (RC == &AMDGPU::AGPR_32RegClass) { 617 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 618 AMDGPU::SReg_32RegClass.contains(SrcReg) || 619 AMDGPU::AGPR_32RegClass.contains(SrcReg)); 620 if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) { 621 // First try to find defining accvgpr_write to avoid temporary registers. 622 for (auto Def = MI, E = MBB.begin(); Def != E; ) { 623 --Def; 624 if (!Def->definesRegister(SrcReg, &RI)) 625 continue; 626 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) 627 break; 628 629 MachineOperand &DefOp = Def->getOperand(1); 630 assert(DefOp.isReg() || DefOp.isImm()); 631 632 if (DefOp.isReg()) { 633 // Check that register source operand if not clobbered before MI. 634 // Immediate operands are always safe to propagate. 
635 bool SafeToPropagate = true; 636 for (auto I = Def; I != MI && SafeToPropagate; ++I) 637 if (I->modifiesRegister(DefOp.getReg(), &RI)) 638 SafeToPropagate = false; 639 640 if (!SafeToPropagate) 641 break; 642 643 DefOp.setIsKill(false); 644 } 645 646 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) 647 .add(DefOp); 648 return; 649 } 650 651 RegScavenger RS; 652 RS.enterBasicBlock(MBB); 653 RS.forward(MI); 654 655 // Ideally we want to have three registers for a long reg_sequence copy 656 // to hide 2 waitstates between v_mov_b32 and accvgpr_write. 657 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 658 *MBB.getParent()); 659 660 // Registers in the sequence are allocated contiguously so we can just 661 // use register number to pick one of three round-robin temps. 662 unsigned RegNo = DestReg % 3; 663 unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); 664 if (!Tmp) 665 report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); 666 RS.setRegUsed(Tmp); 667 // Only loop through if there are any free registers left, otherwise 668 // scavenger may report a fatal error without emergency spill slot 669 // or spill with the slot. 670 while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { 671 unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); 672 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) 673 break; 674 Tmp = Tmp2; 675 RS.setRegUsed(Tmp); 676 } 677 copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc); 678 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) 679 .addReg(Tmp, RegState::Kill); 680 return; 681 } 682 683 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) 684 .addReg(SrcReg, getKillRegState(KillSrc)); 685 return; 686 } 687 688 unsigned EltSize = 4; 689 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 690 if (RI.isSGPRClass(RC)) { 691 // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32. 
692 if (!(RI.getRegSizeInBits(*RC) % 64)) { 693 Opcode = AMDGPU::S_MOV_B64; 694 EltSize = 8; 695 } else { 696 Opcode = AMDGPU::S_MOV_B32; 697 EltSize = 4; 698 } 699 700 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 701 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 702 return; 703 } 704 } else if (RI.hasAGPRs(RC)) { 705 Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? 706 AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; 707 } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { 708 Opcode = AMDGPU::V_ACCVGPR_READ_B32; 709 } 710 711 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 712 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 713 714 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 715 unsigned SubIdx; 716 if (Forward) 717 SubIdx = SubIndices[Idx]; 718 else 719 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 720 721 if (Opcode == TargetOpcode::COPY) { 722 copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), 723 RI.getSubReg(SrcReg, SubIdx), KillSrc); 724 continue; 725 } 726 727 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 728 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 729 730 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 731 732 if (Idx == 0) 733 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 734 735 bool UseKill = KillSrc && Idx == SubIndices.size() - 1; 736 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 737 } 738 } 739 740 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 741 int NewOpc; 742 743 // Try to map original to commuted opcode 744 NewOpc = AMDGPU::getCommuteRev(Opcode); 745 if (NewOpc != -1) 746 // Check if the commuted (REV) opcode exists on the target. 747 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 748 749 // Try to map commuted to original opcode 750 NewOpc = AMDGPU::getCommuteOrig(Opcode); 751 if (NewOpc != -1) 752 // Check if the original (non-REV) opcode exists on the target. 
753 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 754 755 return Opcode; 756 } 757 758 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 759 MachineBasicBlock::iterator MI, 760 const DebugLoc &DL, unsigned DestReg, 761 int64_t Value) const { 762 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 763 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 764 if (RegClass == &AMDGPU::SReg_32RegClass || 765 RegClass == &AMDGPU::SGPR_32RegClass || 766 RegClass == &AMDGPU::SReg_32_XM0RegClass || 767 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 768 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 769 .addImm(Value); 770 return; 771 } 772 773 if (RegClass == &AMDGPU::SReg_64RegClass || 774 RegClass == &AMDGPU::SGPR_64RegClass || 775 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 776 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 777 .addImm(Value); 778 return; 779 } 780 781 if (RegClass == &AMDGPU::VGPR_32RegClass) { 782 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 783 .addImm(Value); 784 return; 785 } 786 if (RegClass == &AMDGPU::VReg_64RegClass) { 787 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 788 .addImm(Value); 789 return; 790 } 791 792 unsigned EltSize = 4; 793 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 794 if (RI.isSGPRClass(RegClass)) { 795 if (RI.getRegSizeInBits(*RegClass) > 32) { 796 Opcode = AMDGPU::S_MOV_B64; 797 EltSize = 8; 798 } else { 799 Opcode = AMDGPU::S_MOV_B32; 800 EltSize = 4; 801 } 802 } 803 804 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 805 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 806 int64_t IdxValue = Idx == 0 ? 
Value : 0; 807 808 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 809 get(Opcode), RI.getSubReg(DestReg, Idx)); 810 Builder.addImm(IdxValue); 811 } 812 } 813 814 const TargetRegisterClass * 815 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 816 return &AMDGPU::VGPR_32RegClass; 817 } 818 819 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 820 MachineBasicBlock::iterator I, 821 const DebugLoc &DL, unsigned DstReg, 822 ArrayRef<MachineOperand> Cond, 823 unsigned TrueReg, 824 unsigned FalseReg) const { 825 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 826 MachineFunction *MF = MBB.getParent(); 827 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 828 const TargetRegisterClass *BoolXExecRC = 829 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 830 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 831 "Not a VGPR32 reg"); 832 833 if (Cond.size() == 1) { 834 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 835 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 836 .add(Cond[0]); 837 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 838 .addImm(0) 839 .addReg(FalseReg) 840 .addImm(0) 841 .addReg(TrueReg) 842 .addReg(SReg); 843 } else if (Cond.size() == 2) { 844 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 845 switch (Cond[0].getImm()) { 846 case SIInstrInfo::SCC_TRUE: { 847 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 848 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 849 : AMDGPU::S_CSELECT_B64), SReg) 850 .addImm(1) 851 .addImm(0); 852 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 853 .addImm(0) 854 .addReg(FalseReg) 855 .addImm(0) 856 .addReg(TrueReg) 857 .addReg(SReg); 858 break; 859 } 860 case SIInstrInfo::SCC_FALSE: { 861 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 862 BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 863 : AMDGPU::S_CSELECT_B64), SReg) 864 .addImm(0) 865 .addImm(1); 866 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 867 .addImm(0) 868 .addReg(FalseReg) 869 .addImm(0) 870 .addReg(TrueReg) 871 .addReg(SReg); 872 break; 873 } 874 case SIInstrInfo::VCCNZ: { 875 MachineOperand RegOp = Cond[1]; 876 RegOp.setImplicit(false); 877 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 878 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 879 .add(RegOp); 880 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 881 .addImm(0) 882 .addReg(FalseReg) 883 .addImm(0) 884 .addReg(TrueReg) 885 .addReg(SReg); 886 break; 887 } 888 case SIInstrInfo::VCCZ: { 889 MachineOperand RegOp = Cond[1]; 890 RegOp.setImplicit(false); 891 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 892 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 893 .add(RegOp); 894 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 895 .addImm(0) 896 .addReg(TrueReg) 897 .addImm(0) 898 .addReg(FalseReg) 899 .addReg(SReg); 900 break; 901 } 902 case SIInstrInfo::EXECNZ: { 903 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 904 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 905 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 906 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 907 .addImm(0); 908 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 909 : AMDGPU::S_CSELECT_B64), SReg) 910 .addImm(1) 911 .addImm(0); 912 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 913 .addImm(0) 914 .addReg(FalseReg) 915 .addImm(0) 916 .addReg(TrueReg) 917 .addReg(SReg); 918 break; 919 } 920 case SIInstrInfo::EXECZ: { 921 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 922 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 923 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 924 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 925 .addImm(0); 926 BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 927 : AMDGPU::S_CSELECT_B64), SReg) 928 .addImm(0) 929 .addImm(1); 930 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 931 .addImm(0) 932 .addReg(FalseReg) 933 .addImm(0) 934 .addReg(TrueReg) 935 .addReg(SReg); 936 llvm_unreachable("Unhandled branch predicate EXECZ"); 937 break; 938 } 939 default: 940 llvm_unreachable("invalid branch predicate"); 941 } 942 } else { 943 llvm_unreachable("Can only handle Cond size 1 or 2"); 944 } 945 } 946 947 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 948 MachineBasicBlock::iterator I, 949 const DebugLoc &DL, 950 unsigned SrcReg, int Value) const { 951 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 952 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 953 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 954 .addImm(Value) 955 .addReg(SrcReg); 956 957 return Reg; 958 } 959 960 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, 961 MachineBasicBlock::iterator I, 962 const DebugLoc &DL, 963 unsigned SrcReg, int Value) const { 964 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 965 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 966 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 967 .addImm(Value) 968 .addReg(SrcReg); 969 970 return Reg; 971 } 972 973 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 974 975 if (RI.hasAGPRs(DstRC)) 976 return AMDGPU::COPY; 977 if (RI.getRegSizeInBits(*DstRC) == 32) { 978 return RI.isSGPRClass(DstRC) ? 
           AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    // No native 64-bit VGPR move; use the pseudo, expanded post-RA.
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

/// Map a spill size in bytes to the matching SGPR spill-save pseudo.
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_S96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_S160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the matching VGPR spill-save pseudo.
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_V160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the matching AGPR spill-save pseudo.
/// Note: only sizes 4/8/16/64/128 have AGPR save pseudos.
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_A64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_A128_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    // Add the scratch resource registers as implicit uses because we may end
    // up needing them, and need to ensure that the reserved registers are
    // correctly handled.
    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    // SGPR spills may be lowered to lane writes into a VGPR; tag the stack
    // slot so the frame lowering knows.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
  if (RI.hasAGPRs(RC)) {
    // AGPR spill pseudos define an extra temporary VGPR operand.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
}

/// Map a spill size in bytes to the matching SGPR spill-restore pseudo.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the matching VGPR spill-restore pseudo.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the matching AGPR spill-restore pseudo.
/// Note: only sizes 4/8/16/64/128 have AGPR restore pseudos.
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    // 32-bit SGPR restores cannot target m0; constrain the class (mirrors
    // the constraint applied on the spill-save path).
    if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                    : getVGPRSpillRestoreOpcode(SpillSize);
  auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
  if (RI.hasAGPRs(RC)) {
    // AGPR restore pseudos define an extra temporary VGPR operand.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addFrameIndex(FrameIndex)           // vaddr
     .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  // Compute (and cache) the per-thread index register once per function.
  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    // Shadows the outer DL on purpose: the TID computation is emitted in the
    // entry block, so use that block's debug location.
    const DebugLoc &DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    // No free VGPR available; caller must handle NoRegister.
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      Register TIDIGXReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      Register TIDIGYReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      Register TIDIGZReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      Register InputPtrReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      getAddNoCarry(Entry, Insert, DL, TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg)
        .addImm(0); // clamp bit
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the thread index by 4 bytes (shift left by 2).
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  getAddNoCarry(MBB, MI, DL, TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg)
    .addImm(0); // clamp bit

  return TmpReg;
}

/// Insert \p Count wait states before \p MI as S_NOP instructions.  Each
/// S_NOP immediate N yields N+1 wait states, so one S_NOP covers up to 8.
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1; // S_NOP N waits N+1 states.
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

/// Append the appropriate return instruction (S_ENDPGM for void,
/// SI_RETURN_TO_EPILOG otherwise) to a successor-less, terminator-less block
/// of an entry function.
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

/// Expand target-specific post-RA pseudo instructions in place.  Returns
/// true if \p MI was handled (possibly erased); falls back to the generic
/// TargetInstrInfo expansion otherwise.
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;

  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Split a 64-bit move into two 32-bit sub-register moves; each half
    // implicitly defines the full 64-bit register to keep its liveness
    // intact.
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    expandMovDPP64(MI);
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Write the source into the currently *inactive* lanes by inverting
    // exec around the move, then restoring it.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // 64-bit variant: the inner move is a V_MOV_B64_PSEUDO which is
    // expanded immediately via a recursive call.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: {
    // Indirect write into a vector register: use V_MOV_B32_indirect when the
    // subtarget has VGPR index mode, otherwise the V_MOVRELD form.
    unsigned Opc = ST.useVGPRIndexMode() ?
      AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
    const MCInstrDesc &OpDesc = get(Opc);
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstrBuilder MIB =
      BuildMI(MBB, MI, DL, OpDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // Tie the implicit def of the full vector register to its implicit use
    // so both stay allocated to the same register.
    const int ImpDefIdx =
      OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    // Materialize PC-relative address: s_getpc_b64 followed by a 64-bit add
    // of the offset operands into the lo/hi halves.
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is exited.
    MI.setDesc(get(ST.isWave32() ?
                   AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  case TargetOpcode::BUNDLE: {
    if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
      return false;

    // If it is a load it must be a memory clause; unbundle the members and
    // clear their internal-read markers.
    for (MachineBasicBlock::instr_iterator I = MI.getIterator();
         I->isBundledWithSucc(); ++I) {
      I->unbundleFromSucc();
      for (MachineOperand &MO : I->operands())
        if (MO.isReg())
          MO.setIsInternalRead(false);
    }

    MI.eraseFromParent();
    break;
  }
  }
  return true;
}

/// Split a V_MOV_B64_DPP_PSEUDO into two V_MOV_B32_dpp halves (sub0/sub1),
/// adding a REG_SEQUENCE to recombine them when the destination is a
/// virtual register.  Returns the pair of half moves.
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register Dst = MI.getOperand(0).getReg();
  unsigned Part = 0;
  MachineInstr *Split[2];


  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
    if (Dst.isPhysical()) {
      MovDPP.addDef(RI.getSubReg(Dst, Sub));
    } else {
      // Virtual destination: define a fresh 32-bit temp for this half; the
      // REG_SEQUENCE below joins the temps.
      assert(MRI.isSSA());
      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      MovDPP.addDef(Tmp);
    }

    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
      const MachineOperand &SrcOp = MI.getOperand(I);
      assert(!SrcOp.isFPImm());
      if (SrcOp.isImm()) {
        // Extract the 32-bit half of the immediate for this part.
        APInt Imm(64, SrcOp.getImm());
        Imm.ashrInPlace(Part * 32);
        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
      } else {
        assert(SrcOp.isReg());
        Register Src = SrcOp.getReg();
        if (Src.isPhysical())
          MovDPP.addReg(RI.getSubReg(Src, Sub));
        else
          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
      }
    }

    // Copy the remaining explicit (immediate) operands unchanged.
    for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
      MovDPP.addImm(MI.getOperand(I).getImm());

    Split[Part] = MovDPP;
    ++Part;
  }

  if (Dst.isVirtual())
    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(Split[0]->getOperand(0).getReg())
      .addImm(AMDGPU::sub0)
      .addReg(Split[1]->getOperand(0).getReg())
      .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return std::make_pair(Split[0], Split[1]);
}

/// Swap the src0/src1 modifier immediates of \p MI when commuting.  Returns
/// false if the instruction has no src0 modifier operand.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

/// Exchange a register operand with an immediate or frame-index operand in
/// place, preserving the register flags.  Returns &MI on success, nullptr
/// when \p NonRegOp is a kind that cannot be swapped.
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  Register Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}

/// Report the operand indices of src0/src1 for a commutable instruction.
/// Returns false when the opcode is not commutable or lacks either source.
bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

/// Expand an out-of-range unconditional branch into a
/// s_getpc/s_add/s_addc/s_setpc sequence in the (new, empty) block \p MBB.
/// Returns the byte size of the inserted code.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  // Rewrite the virtual PC register to a scavenged physical SReg_64,
  // searching backwards from the block end to the s_getpc.
  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegisterBackwards(
    AMDGPU::SReg_64RegClass,
    MachineBasicBlock::iterator(GetPC), false, 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Size in bytes of the emitted sequence — appears to be getpc (4) +
  // add-with-literal (8) + addc (4) + setpc (4); TODO(review) confirm
  // against the ISA encoding sizes.
  return 4 + 8 + 4 + 4;
}

unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

/// Inverse of getBranchOpcode: classify a conditional-branch opcode, or
/// INVALID_BR for anything unrecognized.
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

/// Analyze the branch sequence starting at \p I, filling TBB/FBB/Cond per
/// the TargetInstrInfo::analyzeBranch contract.  Returns true when the
/// terminators cannot be understood.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  auto E = MBB.end();
  if (I == E)
    return false;

  // Skip over the instructions that are artificially terminators for special
  // exec management.
  while (I != E && !I->isBranch() && !I->isReturn() &&
         I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
    switch (I->getOpcode()) {
    case AMDGPU::SI_MASK_BRANCH: // NOTE(review): dead case — already
                                 // excluded by the loop condition above.
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  if (I == E)
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

/// Erase this block's terminators (except SI_MASK_BRANCH, which is kept and
/// not counted).  Returns the number removed; optionally reports bytes
/// removed via \p BytesRemoved.
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
2051 static void preserveCondRegFlags(MachineOperand &CondReg, 2052 const MachineOperand &OrigCond) { 2053 CondReg.setIsUndef(OrigCond.isUndef()); 2054 CondReg.setIsKill(OrigCond.isKill()); 2055 } 2056 2057 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 2058 MachineBasicBlock *TBB, 2059 MachineBasicBlock *FBB, 2060 ArrayRef<MachineOperand> Cond, 2061 const DebugLoc &DL, 2062 int *BytesAdded) const { 2063 if (!FBB && Cond.empty()) { 2064 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2065 .addMBB(TBB); 2066 if (BytesAdded) 2067 *BytesAdded = 4; 2068 return 1; 2069 } 2070 2071 if(Cond.size() == 1 && Cond[0].isReg()) { 2072 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 2073 .add(Cond[0]) 2074 .addMBB(TBB); 2075 return 1; 2076 } 2077 2078 assert(TBB && Cond[0].isImm()); 2079 2080 unsigned Opcode 2081 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 2082 2083 if (!FBB) { 2084 Cond[1].isUndef(); 2085 MachineInstr *CondBr = 2086 BuildMI(&MBB, DL, get(Opcode)) 2087 .addMBB(TBB); 2088 2089 // Copy the flags onto the implicit condition register operand. 
2090 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 2091 2092 if (BytesAdded) 2093 *BytesAdded = 4; 2094 return 1; 2095 } 2096 2097 assert(TBB && FBB); 2098 2099 MachineInstr *CondBr = 2100 BuildMI(&MBB, DL, get(Opcode)) 2101 .addMBB(TBB); 2102 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2103 .addMBB(FBB); 2104 2105 MachineOperand &CondReg = CondBr->getOperand(1); 2106 CondReg.setIsUndef(Cond[1].isUndef()); 2107 CondReg.setIsKill(Cond[1].isKill()); 2108 2109 if (BytesAdded) 2110 *BytesAdded = 8; 2111 2112 return 2; 2113 } 2114 2115 bool SIInstrInfo::reverseBranchCondition( 2116 SmallVectorImpl<MachineOperand> &Cond) const { 2117 if (Cond.size() != 2) { 2118 return true; 2119 } 2120 2121 if (Cond[0].isImm()) { 2122 Cond[0].setImm(-Cond[0].getImm()); 2123 return false; 2124 } 2125 2126 return true; 2127 } 2128 2129 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2130 ArrayRef<MachineOperand> Cond, 2131 unsigned DstReg, unsigned TrueReg, 2132 unsigned FalseReg, int &CondCycles, 2133 int &TrueCycles, int &FalseCycles) const { 2134 switch (Cond[0].getImm()) { 2135 case VCCNZ: 2136 case VCCZ: { 2137 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2138 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2139 assert(MRI.getRegClass(FalseReg) == RC); 2140 2141 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2142 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2143 2144 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2145 return RI.hasVGPRs(RC) && NumInsts <= 6; 2146 } 2147 case SCC_TRUE: 2148 case SCC_FALSE: { 2149 // FIXME: We could insert for VGPRs if we could replace the original compare 2150 // with a vector one. 
2151 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2152 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2153 assert(MRI.getRegClass(FalseReg) == RC); 2154 2155 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2156 2157 // Multiples of 8 can do s_cselect_b64 2158 if (NumInsts % 2 == 0) 2159 NumInsts /= 2; 2160 2161 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2162 return RI.isSGPRClass(RC); 2163 } 2164 default: 2165 return false; 2166 } 2167 } 2168 2169 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2170 MachineBasicBlock::iterator I, const DebugLoc &DL, 2171 unsigned DstReg, ArrayRef<MachineOperand> Cond, 2172 unsigned TrueReg, unsigned FalseReg) const { 2173 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2174 if (Pred == VCCZ || Pred == SCC_FALSE) { 2175 Pred = static_cast<BranchPredicate>(-Pred); 2176 std::swap(TrueReg, FalseReg); 2177 } 2178 2179 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2180 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2181 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2182 2183 if (DstSize == 32) { 2184 unsigned SelOp = Pred == SCC_TRUE ? 2185 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 2186 2187 // Instruction's operands are backwards from what is expected. 
2188 MachineInstr *Select = 2189 BuildMI(MBB, I, DL, get(SelOp), DstReg) 2190 .addReg(FalseReg) 2191 .addReg(TrueReg); 2192 2193 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2194 return; 2195 } 2196 2197 if (DstSize == 64 && Pred == SCC_TRUE) { 2198 MachineInstr *Select = 2199 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2200 .addReg(FalseReg) 2201 .addReg(TrueReg); 2202 2203 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2204 return; 2205 } 2206 2207 static const int16_t Sub0_15[] = { 2208 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2209 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2210 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2211 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2212 }; 2213 2214 static const int16_t Sub0_15_64[] = { 2215 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2216 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2217 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2218 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2219 }; 2220 2221 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2222 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2223 const int16_t *SubIndices = Sub0_15; 2224 int NElts = DstSize / 32; 2225 2226 // 64-bit select is only available for SALU. 2227 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 
2228 if (Pred == SCC_TRUE) { 2229 if (NElts % 2) { 2230 SelOp = AMDGPU::S_CSELECT_B32; 2231 EltRC = &AMDGPU::SGPR_32RegClass; 2232 } else { 2233 SelOp = AMDGPU::S_CSELECT_B64; 2234 EltRC = &AMDGPU::SGPR_64RegClass; 2235 SubIndices = Sub0_15_64; 2236 NElts /= 2; 2237 } 2238 } 2239 2240 MachineInstrBuilder MIB = BuildMI( 2241 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 2242 2243 I = MIB->getIterator(); 2244 2245 SmallVector<unsigned, 8> Regs; 2246 for (int Idx = 0; Idx != NElts; ++Idx) { 2247 Register DstElt = MRI.createVirtualRegister(EltRC); 2248 Regs.push_back(DstElt); 2249 2250 unsigned SubIdx = SubIndices[Idx]; 2251 2252 MachineInstr *Select = 2253 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2254 .addReg(FalseReg, 0, SubIdx) 2255 .addReg(TrueReg, 0, SubIdx); 2256 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2257 fixImplicitOperands(*Select); 2258 2259 MIB.addReg(DstElt) 2260 .addImm(SubIdx); 2261 } 2262 } 2263 2264 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 2265 switch (MI.getOpcode()) { 2266 case AMDGPU::V_MOV_B32_e32: 2267 case AMDGPU::V_MOV_B32_e64: 2268 case AMDGPU::V_MOV_B64_PSEUDO: { 2269 // If there are additional implicit register operands, this may be used for 2270 // register indexing so the source register operand isn't simply copied. 
2271 unsigned NumOps = MI.getDesc().getNumOperands() + 2272 MI.getDesc().getNumImplicitUses(); 2273 2274 return MI.getNumOperands() == NumOps; 2275 } 2276 case AMDGPU::S_MOV_B32: 2277 case AMDGPU::S_MOV_B64: 2278 case AMDGPU::COPY: 2279 case AMDGPU::V_ACCVGPR_WRITE_B32: 2280 case AMDGPU::V_ACCVGPR_READ_B32: 2281 return true; 2282 default: 2283 return false; 2284 } 2285 } 2286 2287 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( 2288 unsigned Kind) const { 2289 switch(Kind) { 2290 case PseudoSourceValue::Stack: 2291 case PseudoSourceValue::FixedStack: 2292 return AMDGPUAS::PRIVATE_ADDRESS; 2293 case PseudoSourceValue::ConstantPool: 2294 case PseudoSourceValue::GOT: 2295 case PseudoSourceValue::JumpTable: 2296 case PseudoSourceValue::GlobalValueCallEntry: 2297 case PseudoSourceValue::ExternalSymbolCallEntry: 2298 case PseudoSourceValue::TargetCustom: 2299 return AMDGPUAS::CONSTANT_ADDRESS; 2300 } 2301 return AMDGPUAS::FLAT_ADDRESS; 2302 } 2303 2304 static void removeModOperands(MachineInstr &MI) { 2305 unsigned Opc = MI.getOpcode(); 2306 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2307 AMDGPU::OpName::src0_modifiers); 2308 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2309 AMDGPU::OpName::src1_modifiers); 2310 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2311 AMDGPU::OpName::src2_modifiers); 2312 2313 MI.RemoveOperand(Src2ModIdx); 2314 MI.RemoveOperand(Src1ModIdx); 2315 MI.RemoveOperand(Src0ModIdx); 2316 } 2317 2318 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 2319 unsigned Reg, MachineRegisterInfo *MRI) const { 2320 if (!MRI->hasOneNonDBGUse(Reg)) 2321 return false; 2322 2323 switch (DefMI.getOpcode()) { 2324 default: 2325 return false; 2326 case AMDGPU::S_MOV_B64: 2327 // TODO: We could fold 64-bit immediates, but this get compilicated 2328 // when there are sub-registers. 
2329 return false; 2330 2331 case AMDGPU::V_MOV_B32_e32: 2332 case AMDGPU::S_MOV_B32: 2333 case AMDGPU::V_ACCVGPR_WRITE_B32: 2334 break; 2335 } 2336 2337 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 2338 assert(ImmOp); 2339 // FIXME: We could handle FrameIndex values here. 2340 if (!ImmOp->isImm()) 2341 return false; 2342 2343 unsigned Opc = UseMI.getOpcode(); 2344 if (Opc == AMDGPU::COPY) { 2345 bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); 2346 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2347 if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { 2348 if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) 2349 return false; 2350 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; 2351 } 2352 UseMI.setDesc(get(NewOpc)); 2353 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 2354 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2355 return true; 2356 } 2357 2358 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2359 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || 2360 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2361 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { 2362 // Don't fold if we are using source or output modifiers. The new VOP2 2363 // instructions don't have them. 2364 if (hasAnyModifiersSet(UseMI)) 2365 return false; 2366 2367 // If this is a free constant, there's no reason to do this. 2368 // TODO: We could fold this here instead of letting SIFoldOperands do it 2369 // later. 2370 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2371 2372 // Any src operand can be used for the legality check. 
2373 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2374 return false; 2375 2376 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2377 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; 2378 bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2379 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; 2380 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2381 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2382 2383 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 2384 // We should only expect these to be on src0 due to canonicalizations. 2385 if (Src0->isReg() && Src0->getReg() == Reg) { 2386 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2387 return false; 2388 2389 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2390 return false; 2391 2392 unsigned NewOpc = 2393 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) 2394 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 2395 if (pseudoToMCOpcode(NewOpc) == -1) 2396 return false; 2397 2398 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 2399 2400 const int64_t Imm = ImmOp->getImm(); 2401 2402 // FIXME: This would be a lot easier if we could return a new instruction 2403 // instead of having to modify in place. 2404 2405 // Remove these first since they are at the end. 
2406 UseMI.RemoveOperand( 2407 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2408 UseMI.RemoveOperand( 2409 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2410 2411 Register Src1Reg = Src1->getReg(); 2412 unsigned Src1SubReg = Src1->getSubReg(); 2413 Src0->setReg(Src1Reg); 2414 Src0->setSubReg(Src1SubReg); 2415 Src0->setIsKill(Src1->isKill()); 2416 2417 if (Opc == AMDGPU::V_MAC_F32_e64 || 2418 Opc == AMDGPU::V_MAC_F16_e64 || 2419 Opc == AMDGPU::V_FMAC_F32_e64 || 2420 Opc == AMDGPU::V_FMAC_F16_e64) 2421 UseMI.untieRegOperand( 2422 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2423 2424 Src1->ChangeToImmediate(Imm); 2425 2426 removeModOperands(UseMI); 2427 UseMI.setDesc(get(NewOpc)); 2428 2429 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2430 if (DeleteDef) 2431 DefMI.eraseFromParent(); 2432 2433 return true; 2434 } 2435 2436 // Added part is the constant: Use v_madak_{f16, f32}. 2437 if (Src2->isReg() && Src2->getReg() == Reg) { 2438 // Not allowed to use constant bus for another operand. 2439 // We can however allow an inline immediate as src0. 2440 bool Src0Inlined = false; 2441 if (Src0->isReg()) { 2442 // Try to inline constant if possible. 2443 // If the Def moves immediate and the use is single 2444 // We are saving VGPR here. 
2445 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2446 if (Def && Def->isMoveImmediate() && 2447 isInlineConstant(Def->getOperand(1)) && 2448 MRI->hasOneUse(Src0->getReg())) { 2449 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2450 Src0Inlined = true; 2451 } else if ((Register::isPhysicalRegister(Src0->getReg()) && 2452 (ST.getConstantBusLimit(Opc) <= 1 && 2453 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || 2454 (Register::isVirtualRegister(Src0->getReg()) && 2455 (ST.getConstantBusLimit(Opc) <= 1 && 2456 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 2457 return false; 2458 // VGPR is okay as Src0 - fallthrough 2459 } 2460 2461 if (Src1->isReg() && !Src0Inlined ) { 2462 // We have one slot for inlinable constant so far - try to fill it 2463 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2464 if (Def && Def->isMoveImmediate() && 2465 isInlineConstant(Def->getOperand(1)) && 2466 MRI->hasOneUse(Src1->getReg()) && 2467 commuteInstruction(UseMI)) { 2468 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2469 } else if ((Register::isPhysicalRegister(Src1->getReg()) && 2470 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2471 (Register::isVirtualRegister(Src1->getReg()) && 2472 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2473 return false; 2474 // VGPR is okay as Src1 - fallthrough 2475 } 2476 2477 unsigned NewOpc = 2478 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) 2479 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 2480 if (pseudoToMCOpcode(NewOpc) == -1) 2481 return false; 2482 2483 const int64_t Imm = ImmOp->getImm(); 2484 2485 // FIXME: This would be a lot easier if we could return a new instruction 2486 // instead of having to modify in place. 2487 2488 // Remove these first since they are at the end. 
2489 UseMI.RemoveOperand( 2490 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2491 UseMI.RemoveOperand( 2492 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2493 2494 if (Opc == AMDGPU::V_MAC_F32_e64 || 2495 Opc == AMDGPU::V_MAC_F16_e64 || 2496 Opc == AMDGPU::V_FMAC_F32_e64 || 2497 Opc == AMDGPU::V_FMAC_F16_e64) 2498 UseMI.untieRegOperand( 2499 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2500 2501 // ChangingToImmediate adds Src2 back to the instruction. 2502 Src2->ChangeToImmediate(Imm); 2503 2504 // These come before src2. 2505 removeModOperands(UseMI); 2506 UseMI.setDesc(get(NewOpc)); 2507 // It might happen that UseMI was commuted 2508 // and we now have SGPR as SRC1. If so 2 inlined 2509 // constant and SGPR are illegal. 2510 legalizeOperands(UseMI); 2511 2512 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2513 if (DeleteDef) 2514 DefMI.eraseFromParent(); 2515 2516 return true; 2517 } 2518 } 2519 2520 return false; 2521 } 2522 2523 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 2524 int WidthB, int OffsetB) { 2525 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 2526 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 2527 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 2528 return LowOffset + LowWidth <= HighOffset; 2529 } 2530 2531 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 2532 const MachineInstr &MIb) const { 2533 const MachineOperand *BaseOp0, *BaseOp1; 2534 int64_t Offset0, Offset1; 2535 2536 if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && 2537 getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { 2538 if (!BaseOp0->isIdenticalTo(*BaseOp1)) 2539 return false; 2540 2541 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 2542 // FIXME: Handle ds_read2 / ds_write2. 
2543 return false; 2544 } 2545 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 2546 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 2547 if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 2548 return true; 2549 } 2550 } 2551 2552 return false; 2553 } 2554 2555 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 2556 const MachineInstr &MIb) const { 2557 assert(MIa.mayLoadOrStore() && 2558 "MIa must load from or modify a memory location"); 2559 assert(MIb.mayLoadOrStore() && 2560 "MIb must load from or modify a memory location"); 2561 2562 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 2563 return false; 2564 2565 // XXX - Can we relax this between address spaces? 2566 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 2567 return false; 2568 2569 // TODO: Should we check the address space from the MachineMemOperand? That 2570 // would allow us to distinguish objects we know don't alias based on the 2571 // underlying address space, even if it was lowered to a different one, 2572 // e.g. private accesses lowered to use MUBUF instructions on a scratch 2573 // buffer. 
2574 if (isDS(MIa)) { 2575 if (isDS(MIb)) 2576 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2577 2578 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2579 } 2580 2581 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2582 if (isMUBUF(MIb) || isMTBUF(MIb)) 2583 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2584 2585 return !isFLAT(MIb) && !isSMRD(MIb); 2586 } 2587 2588 if (isSMRD(MIa)) { 2589 if (isSMRD(MIb)) 2590 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2591 2592 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 2593 } 2594 2595 if (isFLAT(MIa)) { 2596 if (isFLAT(MIb)) 2597 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2598 2599 return false; 2600 } 2601 2602 return false; 2603 } 2604 2605 static int64_t getFoldableImm(const MachineOperand* MO) { 2606 if (!MO->isReg()) 2607 return false; 2608 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2609 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2610 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2611 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2612 Def->getOperand(1).isImm()) 2613 return Def->getOperand(1).getImm(); 2614 return AMDGPU::NoRegister; 2615 } 2616 2617 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2618 MachineInstr &MI, 2619 LiveVariables *LV) const { 2620 unsigned Opc = MI.getOpcode(); 2621 bool IsF16 = false; 2622 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2623 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; 2624 2625 switch (Opc) { 2626 default: 2627 return nullptr; 2628 case AMDGPU::V_MAC_F16_e64: 2629 case AMDGPU::V_FMAC_F16_e64: 2630 IsF16 = true; 2631 LLVM_FALLTHROUGH; 2632 case AMDGPU::V_MAC_F32_e64: 2633 case AMDGPU::V_FMAC_F32_e64: 2634 break; 2635 case AMDGPU::V_MAC_F16_e32: 2636 case AMDGPU::V_FMAC_F16_e32: 2637 IsF16 = true; 2638 LLVM_FALLTHROUGH; 2639 case AMDGPU::V_MAC_F32_e32: 2640 case AMDGPU::V_FMAC_F32_e32: { 2641 int Src0Idx = 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2642 AMDGPU::OpName::src0); 2643 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2644 if (!Src0->isReg() && !Src0->isImm()) 2645 return nullptr; 2646 2647 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2648 return nullptr; 2649 2650 break; 2651 } 2652 } 2653 2654 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2655 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2656 const MachineOperand *Src0Mods = 2657 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2658 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2659 const MachineOperand *Src1Mods = 2660 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2661 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2662 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2663 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2664 2665 if (!Src0Mods && !Src1Mods && !Clamp && !Omod && 2666 // If we have an SGPR input, we will violate the constant bus restriction. 2667 (ST.getConstantBusLimit(Opc) > 1 || 2668 !Src0->isReg() || 2669 !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 2670 if (auto Imm = getFoldableImm(Src2)) { 2671 unsigned NewOpc = 2672 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) 2673 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 2674 if (pseudoToMCOpcode(NewOpc) != -1) 2675 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2676 .add(*Dst) 2677 .add(*Src0) 2678 .add(*Src1) 2679 .addImm(Imm); 2680 } 2681 unsigned NewOpc = 2682 IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) 2683 : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 2684 if (auto Imm = getFoldableImm(Src1)) { 2685 if (pseudoToMCOpcode(NewOpc) != -1) 2686 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2687 .add(*Dst) 2688 .add(*Src0) 2689 .addImm(Imm) 2690 .add(*Src2); 2691 } 2692 if (auto Imm = getFoldableImm(Src0)) { 2693 if (pseudoToMCOpcode(NewOpc) != -1 && 2694 isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, 2695 AMDGPU::OpName::src0), Src1)) 2696 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2697 .add(*Dst) 2698 .add(*Src1) 2699 .addImm(Imm) 2700 .add(*Src2); 2701 } 2702 } 2703 2704 unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) 2705 : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); 2706 if (pseudoToMCOpcode(NewOpc) == -1) 2707 return nullptr; 2708 2709 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2710 .add(*Dst) 2711 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2712 .add(*Src0) 2713 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2714 .add(*Src1) 2715 .addImm(0) // Src mods 2716 .add(*Src2) 2717 .addImm(Clamp ? Clamp->getImm() : 0) 2718 .addImm(Omod ? Omod->getImm() : 0); 2719 } 2720 2721 // It's not generally safe to move VALU instructions across these since it will 2722 // start using the register as a base index rather than directly. 2723 // XXX - Why isn't hasSideEffects sufficient for these? 2724 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 2725 switch (MI.getOpcode()) { 2726 case AMDGPU::S_SET_GPR_IDX_ON: 2727 case AMDGPU::S_SET_GPR_IDX_MODE: 2728 case AMDGPU::S_SET_GPR_IDX_OFF: 2729 return true; 2730 default: 2731 return false; 2732 } 2733 } 2734 2735 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 2736 const MachineBasicBlock *MBB, 2737 const MachineFunction &MF) const { 2738 // XXX - Do we want the SP check in the base implementation? 2739 2740 // Target-independent instructions do not have an implicit-use of EXEC, even 2741 // when they operate on VGPRs. 
Treating EXEC modifications as scheduling 2742 // boundaries prevents incorrect movements of such instructions. 2743 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 2744 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 2745 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 2746 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 2747 MI.getOpcode() == AMDGPU::S_DENORM_MODE || 2748 changesVGPRIndexingMode(MI); 2749 } 2750 2751 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 2752 return Opcode == AMDGPU::DS_ORDERED_COUNT || 2753 Opcode == AMDGPU::DS_GWS_INIT || 2754 Opcode == AMDGPU::DS_GWS_SEMA_V || 2755 Opcode == AMDGPU::DS_GWS_SEMA_BR || 2756 Opcode == AMDGPU::DS_GWS_SEMA_P || 2757 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 2758 Opcode == AMDGPU::DS_GWS_BARRIER; 2759 } 2760 2761 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 2762 unsigned Opcode = MI.getOpcode(); 2763 2764 if (MI.mayStore() && isSMRD(MI)) 2765 return true; // scalar store or atomic 2766 2767 // This will terminate the function when other lanes may need to continue. 2768 if (MI.isReturn()) 2769 return true; 2770 2771 // These instructions cause shader I/O that may cause hardware lockups 2772 // when executed with an empty EXEC mask. 2773 // 2774 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 2775 // EXEC = 0, but checking for that case here seems not worth it 2776 // given the typical code patterns. 2777 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 2778 Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || 2779 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 2780 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 2781 return true; 2782 2783 if (MI.isCall() || MI.isInlineAsm()) 2784 return true; // conservative assumption 2785 2786 // These are like SALU instructions in terms of effects, so it's questionable 2787 // whether we should return true for those. 
2788 // 2789 // However, executing them with EXEC = 0 causes them to operate on undefined 2790 // data, which we avoid by returning true here. 2791 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32) 2792 return true; 2793 2794 return false; 2795 } 2796 2797 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 2798 const MachineInstr &MI) const { 2799 if (MI.isMetaInstruction()) 2800 return false; 2801 2802 // This won't read exec if this is an SGPR->SGPR copy. 2803 if (MI.isCopyLike()) { 2804 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) 2805 return true; 2806 2807 // Make sure this isn't copying exec as a normal operand 2808 return MI.readsRegister(AMDGPU::EXEC, &RI); 2809 } 2810 2811 // Make a conservative assumption about the callee. 2812 if (MI.isCall()) 2813 return true; 2814 2815 // Be conservative with any unhandled generic opcodes. 2816 if (!isTargetSpecificOpcode(MI.getOpcode())) 2817 return true; 2818 2819 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); 2820 } 2821 2822 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 2823 switch (Imm.getBitWidth()) { 2824 case 1: // This likely will be a condition code mask. 
2825 return true; 2826 2827 case 32: 2828 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 2829 ST.hasInv2PiInlineImm()); 2830 case 64: 2831 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 2832 ST.hasInv2PiInlineImm()); 2833 case 16: 2834 return ST.has16BitInsts() && 2835 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 2836 ST.hasInv2PiInlineImm()); 2837 default: 2838 llvm_unreachable("invalid bitwidth"); 2839 } 2840 } 2841 2842 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 2843 uint8_t OperandType) const { 2844 if (!MO.isImm() || 2845 OperandType < AMDGPU::OPERAND_SRC_FIRST || 2846 OperandType > AMDGPU::OPERAND_SRC_LAST) 2847 return false; 2848 2849 // MachineOperand provides no way to tell the true operand size, since it only 2850 // records a 64-bit value. We need to know the size to determine if a 32-bit 2851 // floating point immediate bit pattern is legal for an integer immediate. It 2852 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
2853 2854 int64_t Imm = MO.getImm(); 2855 switch (OperandType) { 2856 case AMDGPU::OPERAND_REG_IMM_INT32: 2857 case AMDGPU::OPERAND_REG_IMM_FP32: 2858 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2859 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 2860 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 2861 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { 2862 int32_t Trunc = static_cast<int32_t>(Imm); 2863 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 2864 } 2865 case AMDGPU::OPERAND_REG_IMM_INT64: 2866 case AMDGPU::OPERAND_REG_IMM_FP64: 2867 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2868 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2869 return AMDGPU::isInlinableLiteral64(MO.getImm(), 2870 ST.hasInv2PiInlineImm()); 2871 case AMDGPU::OPERAND_REG_IMM_INT16: 2872 case AMDGPU::OPERAND_REG_IMM_FP16: 2873 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2874 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 2875 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 2876 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 2877 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 2878 // A few special case instructions have 16-bit operands on subtargets 2879 // where 16-bit instructions are not legal. 2880 // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle 2881 // constants in these cases 2882 int16_t Trunc = static_cast<int16_t>(Imm); 2883 return ST.has16BitInsts() && 2884 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 2885 } 2886 2887 return false; 2888 } 2889 case AMDGPU::OPERAND_REG_IMM_V2INT16: 2890 case AMDGPU::OPERAND_REG_IMM_V2FP16: 2891 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 2892 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 2893 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: 2894 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 2895 uint32_t Trunc = static_cast<uint32_t>(Imm); 2896 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 2897 } 2898 default: 2899 llvm_unreachable("invalid bitwidth"); 2900 } 2901 } 2902 2903 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 2904 const MCOperandInfo &OpInfo) const { 2905 switch (MO.getType()) { 2906 case MachineOperand::MO_Register: 2907 return false; 2908 case MachineOperand::MO_Immediate: 2909 return !isInlineConstant(MO, OpInfo); 2910 case MachineOperand::MO_FrameIndex: 2911 case MachineOperand::MO_MachineBasicBlock: 2912 case MachineOperand::MO_ExternalSymbol: 2913 case MachineOperand::MO_GlobalAddress: 2914 case MachineOperand::MO_MCSymbol: 2915 return true; 2916 default: 2917 llvm_unreachable("unexpected operand type"); 2918 } 2919 } 2920 2921 static bool compareMachineOp(const MachineOperand &Op0, 2922 const MachineOperand &Op1) { 2923 if (Op0.getType() != Op1.getType()) 2924 return false; 2925 2926 switch (Op0.getType()) { 2927 case MachineOperand::MO_Register: 2928 return Op0.getReg() == Op1.getReg(); 2929 case MachineOperand::MO_Immediate: 2930 return Op0.getImm() == Op1.getImm(); 2931 default: 2932 llvm_unreachable("Didn't expect to be comparing these operand types"); 2933 } 2934 } 2935 2936 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 2937 const MachineOperand &MO) const { 2938 const MCInstrDesc &InstDesc = MI.getDesc(); 2939 const MCOperandInfo 
&OpInfo = InstDesc.OpInfo[OpNo]; 2940 2941 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 2942 2943 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 2944 return true; 2945 2946 if (OpInfo.RegClass < 0) 2947 return false; 2948 2949 const MachineFunction *MF = MI.getParent()->getParent(); 2950 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 2951 2952 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 2953 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 2954 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2955 AMDGPU::OpName::src2)) 2956 return false; 2957 return RI.opCanUseInlineConstant(OpInfo.OperandType); 2958 } 2959 2960 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 2961 return false; 2962 2963 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 2964 return true; 2965 2966 return ST.hasVOP3Literal(); 2967 } 2968 2969 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 2970 int Op32 = AMDGPU::getVOPe32(Opcode); 2971 if (Op32 == -1) 2972 return false; 2973 2974 return pseudoToMCOpcode(Op32) != -1; 2975 } 2976 2977 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 2978 // The src0_modifier operand is present on all instructions 2979 // that have modifiers. 
2980 2981 return AMDGPU::getNamedOperandIdx(Opcode, 2982 AMDGPU::OpName::src0_modifiers) != -1; 2983 } 2984 2985 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 2986 unsigned OpName) const { 2987 const MachineOperand *Mods = getNamedOperand(MI, OpName); 2988 return Mods && Mods->getImm(); 2989 } 2990 2991 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 2992 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 2993 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 2994 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 2995 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 2996 hasModifiersSet(MI, AMDGPU::OpName::omod); 2997 } 2998 2999 bool SIInstrInfo::canShrink(const MachineInstr &MI, 3000 const MachineRegisterInfo &MRI) const { 3001 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3002 // Can't shrink instruction with three operands. 3003 // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add 3004 // a special case for it. It can only be shrunk if the third operand 3005 // is vcc, and src0_modifiers and src1_modifiers are not set. 3006 // We should handle this the same way we handle vopc, by addding 3007 // a register allocation hint pre-regalloc and then do the shrinking 3008 // post-regalloc. 3009 if (Src2) { 3010 switch (MI.getOpcode()) { 3011 default: return false; 3012 3013 case AMDGPU::V_ADDC_U32_e64: 3014 case AMDGPU::V_SUBB_U32_e64: 3015 case AMDGPU::V_SUBBREV_U32_e64: { 3016 const MachineOperand *Src1 3017 = getNamedOperand(MI, AMDGPU::OpName::src1); 3018 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 3019 return false; 3020 // Additional verification is needed for sdst/src2. 
3021 return true; 3022 } 3023 case AMDGPU::V_MAC_F32_e64: 3024 case AMDGPU::V_MAC_F16_e64: 3025 case AMDGPU::V_FMAC_F32_e64: 3026 case AMDGPU::V_FMAC_F16_e64: 3027 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 3028 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 3029 return false; 3030 break; 3031 3032 case AMDGPU::V_CNDMASK_B32_e64: 3033 break; 3034 } 3035 } 3036 3037 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3038 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 3039 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 3040 return false; 3041 3042 // We don't need to check src0, all input types are legal, so just make sure 3043 // src0 isn't using any modifiers. 3044 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 3045 return false; 3046 3047 // Can it be shrunk to a valid 32 bit opcode? 3048 if (!hasVALU32BitEncoding(MI.getOpcode())) 3049 return false; 3050 3051 // Check output modifiers 3052 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 3053 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 3054 } 3055 3056 // Set VCC operand with all flags from \p Orig, except for setting it as 3057 // implicit. 3058 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3059 const MachineOperand &Orig) { 3060 3061 for (MachineOperand &Use : MI.implicit_operands()) { 3062 if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { 3063 Use.setIsUndef(Orig.isUndef()); 3064 Use.setIsKill(Orig.isKill()); 3065 return; 3066 } 3067 } 3068 } 3069 3070 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3071 unsigned Op32) const { 3072 MachineBasicBlock *MBB = MI.getParent();; 3073 MachineInstrBuilder Inst32 = 3074 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); 3075 3076 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3077 // For VOPC instructions, this is replaced by an implicit def of vcc. 
  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
  if (Op32DstIdx != -1) {
    // dst
    Inst32.add(MI.getOperand(0));
  } else {
    // The 32-bit form has no explicit $vdst: the result must already be
    // vcc (or vcc_lo), which the shrunk encoding defines implicitly.
    assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
            (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
           "Unexpected case");
  }

  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));

  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1)
    Inst32.add(*Src1);

  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  if (Src2) {
    int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
    if (Op32Src2Idx != -1) {
      Inst32.add(*Src2);
    } else {
      // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
      // replaced with an implicit read of vcc. This was already added
      // during the initial BuildMI, so find it to preserve the flags.
      copyFlagsToImplicitVCC(*Inst32, *Src2);
    }
  }

  return Inst32;
}

/// Returns true if reading \p MO consumes one of the limited constant bus
/// slots: a non-inline (literal) immediate, any non-register operand (e.g.
/// FrameIndex), or a read of an SGPR / m0 / vcc.
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  const MCOperandInfo &OpInfo) const {
  // Literal constants use the constant bus.
  //if (isLiteralConstantLike(MO, OpInfo))
  //  return true;
  if (MO.isImm())
    return !isInlineConstant(MO, OpInfo);

  if (!MO.isReg())
    return true; // Misc other operands like FrameIndex

  // Register defs never read the bus.
  if (!MO.isUse())
    return false;

  if (Register::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // Null is free
  if (MO.getReg() == AMDGPU::SGPR_NULL)
    return false;

  // SGPRs use the constant bus
  if (MO.isImplicit()) {
    return MO.getReg() == AMDGPU::M0 ||
           MO.getReg() == AMDGPU::VCC ||
           MO.getReg() == AMDGPU::VCC_LO;
  } else {
    return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
           AMDGPU::SReg_64RegClass.contains(MO.getReg());
  }
}

/// Returns the first implicitly-read special scalar register
/// (vcc/vcc_lo/vcc_hi/m0/flat_scratch) of \p MI, or AMDGPU::NoRegister.
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::VCC_LO:
    case AMDGPU::VCC_HI:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

/// Returns true if \p MI is expected to carry an implicit use of EXEC:
/// all VALU instructions except the readlane/writelane family; generic,
/// pre-ISel, SALU, and SMRD instructions do not read exec.
static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READLANE_B32_gfx6_gfx7:
    case AMDGPU::V_READLANE_B32_gfx10:
    case AMDGPU::V_READLANE_B32_vi:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::V_WRITELANE_B32_gfx6_gfx7:
    case AMDGPU::V_WRITELANE_B32_gfx10:
    case AMDGPU::V_WRITELANE_B32_vi:
      return false;
    }

    return true;
  }

  if (MI.isPreISelOpcode() ||
      SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}

// Returns true if \p SubReg refers to a sub-register of \p SuperVec:
// physical sub-register containment, or (for virtuals) the same register
// accessed through a sub-register index.
static bool
isSubRegOf(const SIRegisterInfo &TRI,
           const MachineOperand &SuperVec,
           const MachineOperand &SubReg) {
  if (Register::isPhysicalRegister(SubReg.getReg()))
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}

/// Machine verifier hook: checks target-specific invariants of \p MI
/// (operand counts and classes, SDWA/MIMG/DPP encoding restrictions,
/// constant bus limits, exec reads, etc.). Returns false and sets
/// \p ErrInfo on the first violation found.
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  // Target-independent opcodes are not checked here.
  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    return true;

  const MachineFunction *MF = MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  if (MI.isInlineAsm()) {
    // Verify register classes for inlineasm constraints.
    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
         I != E; ++I) {
      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
      if (!RC)
        continue;

      const MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) {
        ErrInfo = "inlineasm operand has incorrect register class.";
        return false;
      }
    }

    return true;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI.getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM_INT32:
    case AMDGPU::OPERAND_REG_IMM_FP32:
      // These accept either a register or an arbitrary 32-bit immediate.
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
    case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
      // These accept a register or only an inline constant immediate.
      const MachineOperand &MO = MI.getOperand(i);
      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    }
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      LLVM_FALLTHROUGH;
    default:
      continue;
    }

    if (!MI.getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      // Physical registers must belong to the class the descriptor demands;
      // virtual registers are checked elsewhere.
      Register Reg = MI.getOperand(i).getReg();
      if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify SDWA
  if (isSDWA(MI)) {
    if (!ST.hasSDWA()) {
      ErrInfo = "SDWA is not supported on this target";
      return false;
    }

    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);

    const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };

    for (int OpIdx: OpIndicies) {
      if (OpIdx == -1)
        continue;
      const MachineOperand &MO = MI.getOperand(OpIdx);

      if (!ST.hasSDWAScalar()) {
        // Only VGPRS on VI
        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
          return false;
        }
      } else {
        // No immediates on GFX9
        if (!MO.isReg()) {
          ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
          return false;
        }
      }
    }

    if (!ST.hasSDWAOmod()) {
      // No omod allowed on VI
      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
      if (OMod != nullptr &&
          (!OMod->isImm() || OMod->getImm() != 0)) {
        ErrInfo = "OMod not allowed in SDWA instructions on VI";
        return false;
      }
    }

    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
    if (isVOPC(BasicOpcode)) {
      if (!ST.hasSDWASdst() && DstIdx != -1) {
        // Only vcc allowed as dst on VI for VOPC
        const MachineOperand &Dst = MI.getOperand(DstIdx);
        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
          return false;
        }
      } else if (!ST.hasSDWAOutModsVOPC()) {
        // No clamp allowed on GFX9 for VOPC
        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
        if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
          return false;
        }

        // No omod allowed on GFX9 for VOPC
        const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
        if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
          ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
          return false;
        }
      }
    }

    // With dst_unused == PRESERVE, the destination must be tied to an
    // implicit use of the preserved register.
    const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused && DstUnused->isImm() &&
        DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
      const MachineOperand &Dst = MI.getOperand(DstIdx);
      if (!Dst.isReg() || !Dst.isTied()) {
        ErrInfo = "Dst register should have tied register";
        return false;
      }

      const MachineOperand &TiedMO =
          MI.getOperand(MI.findTiedOperandIdx(DstIdx));
      if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
        ErrInfo =
            "Dst register should be tied to implicit use of preserved register";
        return false;
      } else if (Register::isPhysicalRegister(TiedMO.getReg()) &&
                 Dst.getReg() != TiedMO.getReg()) {
        ErrInfo = "Dst register should use same physical register as preserved";
        return false;
      }
    }
  }

  // Verify MIMG
  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
    // Ensure that the return type used is large enough for all the options
    // being used TFE/LWE require an extra result register.
    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
    if (DMask) {
      uint64_t DMaskImm = DMask->getImm();
      // gather4 always returns 4 channels; otherwise one per dmask bit.
      uint32_t RegCount =
          isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);

      // Adjust for packed 16 bit values
      if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
        RegCount >>= 1;

      // Adjust if using LWE or TFE
      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
        RegCount += 1;

      const uint32_t DstIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
      const MachineOperand &Dst = MI.getOperand(DstIdx);
      if (Dst.isReg()) {
        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
        if (RegCount > DstSize) {
          ErrInfo = "MIMG instruction returns too many registers for dst "
                    "register class";
          return false;
        }
      }
    }
  }

  // Verify VOP*. Ignore multiple sgpr operands on writelane.
  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
      && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned LiteralCount = 0;

    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
      ++ConstantBusCount;

    SmallVector<unsigned, 2> SGPRsUsed;
    unsigned SGPRUsed = findImplicitSGPRRead(MI);
    if (SGPRUsed != AMDGPU::NoRegister) {
      ++ConstantBusCount;
      SGPRsUsed.push_back(SGPRUsed);
    }

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI.getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
        if (MO.isReg()) {
          SGPRUsed = MO.getReg();
          // Multiple reads of the same SGPR count once against the limit.
          if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
                return !RI.regsOverlap(SGPRUsed, SGPR);
              })) {
            ++ConstantBusCount;
            SGPRsUsed.push_back(SGPRUsed);
          }
        } else {
          ++ConstantBusCount;
          ++LiteralCount;
        }
      }
    }
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    // v_writelane_b32 is an exception from constant bus restriction:
    // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
    if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
        Opcode != AMDGPU::V_WRITELANE_B32) {
      ErrInfo = "VOP* instruction violates constant bus restriction";
      return false;
    }

    if (isVOP3(MI) && LiteralCount) {
      if (LiteralCount && !ST.hasVOP3Literal()) {
        ErrInfo = "VOP3 instruction uses literal";
        return false;
      }
      if (LiteralCount > 1) {
        ErrInfo = "VOP3 instruction uses more than one literal";
        return false;
      }
    }
  }

  // Special case for writelane - this can break the multiple constant bus rule,
  // but still can't use more than one SGPR register
  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
    unsigned SGPRCount = 0;
    Register SGPRUsed = AMDGPU::NoRegister;

    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
      if (OpIdx == -1)
        break;

      const MachineOperand &MO = MI.getOperand(OpIdx);

      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
          if (MO.getReg() != SGPRUsed)
            ++SGPRCount;
          SGPRUsed = MO.getReg();
        }
      }
      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
        ErrInfo = "WRITELANE instruction violates constant bus restriction";
        return false;
      }
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  // SOP2/SOPC can encode at most one non-inline immediate.
  if (isSOP2(MI) || isSOPC(MI)) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    unsigned Immediates = 0;

    if (!Src0.isReg() &&
        !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
      Immediates++;
    if (!Src1.isReg() &&
        !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
      Immediates++;

    if (Immediates > 1) {
      ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
      return false;
    }
  }

  // SOPK: branch forms take an MBB target; otherwise simm16 must fit in
  // 16 bits, zero- or sign-extended depending on the opcode.
  if (isSOPK(MI)) {
    auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
    if (Desc.isBranch()) {
      if (!Op->isMBB()) {
        ErrInfo = "invalid branch target for SOPK instruction";
        return false;
      }
    } else {
      uint64_t Imm = Op->getImm();
      if (sopkIsZext(MI)) {
        if (!isUInt<16>(Imm)) {
          ErrInfo = "invalid immediate for SOPK instruction";
          return false;
        }
      } else {
        if (!isInt<16>(Imm)) {
          ErrInfo = "invalid immediate for SOPK instruction";
          return false;
        }
      }
    }
  }

  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

    const unsigned StaticNumOps = Desc.getNumOperands() +
      Desc.getNumImplicitUses();
    const unsigned NumImplicitOps = IsDst ? 2 : 1;

    // Allow additional implicit operands. This allows a fixup done by the post
    // RA scheduler where the main implicit operand is killed and implicit-defs
    // are added for sub-registers that remain live after this instruction.
    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
      ErrInfo = "missing implicit register operands";
      return false;
    }

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (IsDst) {
      if (!Dst->isUse()) {
        ErrInfo = "v_movreld_b32 vdst should be a use operand";
        return false;
      }

      unsigned UseOpIdx;
      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
          UseOpIdx != StaticNumOps + 1) {
        ErrInfo = "movrel implicit operands should be tied";
        return false;
      }
    }

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &ImpUse
      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    if (!ImpUse.isReg() || !ImpUse.isUse() ||
        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
      ErrInfo = "src0 should be subreg of implicit vector use";
      return false;
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (shouldReadExec(MI)) {
    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  if (isSMRD(MI)) {
    if (MI.mayStore()) {
      // The register offset form of scalar stores may only use m0 as the
      // soffset register.
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
      if (Soff && Soff->getReg() != AMDGPU::M0) {
        ErrInfo = "scalar stores must use m0 as offset register";
        return false;
      }
    }
  }

  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    if (Offset->getImm() != 0) {
      ErrInfo = "subtarget does not support offsets in flat instructions";
      return false;
    }
  }

  if (isMIMG(MI)) {
    const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
    if (DimOp) {
      int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
                                                 AMDGPU::OpName::vaddr0);
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
      const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
          AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
      const AMDGPU::MIMGDimInfo *Dim =
          AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());

      if (!Dim) {
        ErrInfo = "dim is out of range";
        return false;
      }

      // NSA (non-sequential address) forms have one vaddr operand per
      // address word between vaddr0 and srsrc.
      bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
      unsigned AddrWords = BaseOpcode->NumExtraArgs +
                           (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
                           (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
                           (BaseOpcode->LodOrClampOrMip ? 1 : 0);

      unsigned VAddrWords;
      if (IsNSA) {
        VAddrWords = SRsrcIdx - VAddr0Idx;
      } else {
        const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
        VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
        // Non-NSA vaddr register classes round up to the next power of two.
        if (AddrWords > 8)
          AddrWords = 16;
        else if (AddrWords > 4)
          AddrWords = 8;
        else if (AddrWords == 3 && VAddrWords == 4) {
          // CodeGen uses the V4 variant of instructions for three addresses,
          // because the selection DAG does not support non-power-of-two types.
          AddrWords = 4;
        }
      }

      if (VAddrWords != AddrWords) {
        ErrInfo = "bad vaddr size";
        return false;
      }
    }
  }

  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
  if (DppCt) {
    using namespace AMDGPU::DPP;

    // Reject encodings in the unused/reserved ranges of the dpp_ctrl field.
    unsigned DC = DppCt->getImm();
    if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
        DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
        (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
      ErrInfo = "Invalid dpp_ctrl value";
      return false;
    }
    if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "wavefront shifts are not supported on GFX10+";
      return false;
    }
    if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "broadcasts are not supported on GFX10+";
      return false;
    }
    if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <=
            DppCtrl::ROW_XMASK_LAST &&
        ST.getGeneration() < AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "row_share and row_xmask are not supported before GFX10";
      return false;
    }
  }

  return true;
}

/// Returns the VALU opcode to use when moving this scalar instruction to
/// the VALU, or AMDGPU::INSTRUCTION_LIST_END if there is no equivalent.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
  case AMDGPU::WWM: return AMDGPU::WWM;
  case AMDGPU::S_MOV_B32: {
    // A register source or an AGPR destination cannot use v_mov_b32; keep it
    // as a COPY instead.
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): the 64-bit scalar NOT maps to the 32-bit VALU NOT;
  // presumably the caller splits it per 32-bit half — confirm.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}

/// Returns the register class of operand \p OpNo of \p MI: the class from
/// the instruction descriptor when available, otherwise the class of the
/// actual (virtual or physical) register in the operand.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    Register Reg = MI.getOperand(OpNo).getReg();

    if (Register::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

/// Legalizes operand \p OpIdx of \p MI by copying/moving it into a newly
/// created virtual register of a legal class and rewriting the operand to
/// use that register.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  // Pick the mov/copy opcode matching the operand kind and register size.
  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = (Size == 64) ?
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 3851 3852 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 3853 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 3854 VRC = &AMDGPU::VReg_64RegClass; 3855 else 3856 VRC = &AMDGPU::VGPR_32RegClass; 3857 3858 Register Reg = MRI.createVirtualRegister(VRC); 3859 DebugLoc DL = MBB->findDebugLoc(I); 3860 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 3861 MO.ChangeToRegister(Reg, false); 3862 } 3863 3864 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 3865 MachineRegisterInfo &MRI, 3866 MachineOperand &SuperReg, 3867 const TargetRegisterClass *SuperRC, 3868 unsigned SubIdx, 3869 const TargetRegisterClass *SubRC) 3870 const { 3871 MachineBasicBlock *MBB = MI->getParent(); 3872 DebugLoc DL = MI->getDebugLoc(); 3873 Register SubReg = MRI.createVirtualRegister(SubRC); 3874 3875 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 3876 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 3877 .addReg(SuperReg.getReg(), 0, SubIdx); 3878 return SubReg; 3879 } 3880 3881 // Just in case the super register is itself a sub-register, copy it to a new 3882 // value so we don't need to worry about merging its subreg index with the 3883 // SubIdx passed to this function. The register coalescer should be able to 3884 // eliminate this extra copy. 
  Register NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
      .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

/// Like buildExtractSubReg, but \p Op may also be a 64-bit immediate, in
/// which case the requested 32-bit half (sub0 = low, sub1 = high) is returned
/// as an immediate operand instead of emitting any copies.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  // Removing operand 1 and re-appending it moves it to the end (index 2),
  // shifting the old operand 2 down into index 1.
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}

/// Return true if register operand \p MO satisfies the register-class
/// constraint \p OpInfo, taking any subregister index on \p MO into account.
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  Register Reg = MO.getReg();
  const TargetRegisterClass *RC = Register::isVirtualRegister(Reg)
                                      ? MRI.getRegClass(Reg)
                                      : RI.getPhysRegClass(Reg);

  const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
  if (MO.getSubReg()) {
    // With a subreg index, the constraint applies to the whole register:
    // require a super-class whose SubReg subregister lands in DRC.
    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
    if (!SuperRC)
      return false;

    DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
    if (!DRC)
      return false;
  }
  return RC->hasSuperClassEq(DRC);
}

/// Return true if \p MO is legal for a VALU source slot described by
/// \p OpInfo; non-register operands are treated like immediates and accepted.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
  return true;
}

/// Check whether operand \p MO (defaulting to MI's operand \p OpIdx) would be
/// legal at position \p OpIdx of \p MI, enforcing the subtarget's constant bus
/// and VOP3 literal limits in addition to register-class constraints.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
      return false;

    // Count distinct SGPRs (and literals) used by the other operands; the
    // candidate operand at OpIdx is pre-seeded so it isn't double counted.
    SmallDenseSet<RegSubRegPair> SGPRsUsed;
    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
        if (!SGPRsUsed.count(SGPR) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          if (--ConstantBusLimit <= 0)
            return false;
          SGPRsUsed.insert(SGPR);
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        if (--ConstantBusLimit <= 0)
          return false;
      } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
                 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
        // A VOP3 literal consumes both a literal slot and a constant bus slot.
        if (!VOP3LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

/// Legalize the source operands of a VOP2 (or VOPC) instruction so that at
/// most the allowed number of SGPR/literal operands remain, moving, commuting,
/// or v_readfirstlane-ing operands as needed.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use before GFX10.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
      Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
       isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
    legalizeOpWithMove(MI, Src0Idx);

  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
  // src0/src1 with V_READFIRSTLANE.
  if (Opc == AMDGPU::V_WRITELANE_B32) {
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src0);
      Src0.ChangeToRegister(Reg, false);
    }
    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    return;
  }

  // No VOP2 instructions support AGPRs.
  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
    legalizeOpWithMove(MI, Src0Idx);

  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
    legalizeOpWithMove(MI, Src1Idx);

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Manually swap src0 and src1. Save src0's fields first, since changing
  // Src0 below clobbers them.
  Register Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
  fixImplicitOperands(MI);
}

// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32 ||
      Opc == AMDGPU::V_PERMLANEX16_B32) {
    // src1 and src2 must be scalar
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src2);
      Src2.ChangeToRegister(Reg, false);
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg != AMDGPU::NoRegister) {
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
      // Non-register operand: only literal-like operands consume limits;
      // inline constants are always free.
      if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
        continue;

      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
        !isOperandLegal(MI, Idx, &MO)) {
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue;
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

/// Copy the (assumed-uniform) value in VGPR/AGPR \p SrcReg into a new SGPR
/// virtual register of equivalent size using V_READFIRSTLANE_B32 — one per
/// 32-bit subregister, combined with a REG_SEQUENCE for wider values — and
/// return the new SGPR. Instructions are inserted before \p UseMI.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  Register DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  if (RI.hasAGPRs(VRC)) {
    // V_READFIRSTLANE_B32 cannot read an AGPR directly; copy to VGPR first.
    VRC = RI.getEquivalentVGPRClass(VRC);
    Register NewSrcReg = MRI.createVirtualRegister(VRC);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(TargetOpcode::COPY), NewSrcReg)
        .addReg(SrcReg);
    SrcReg = NewSrcReg;
  }

  if (SubRegs == 1) {
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
        .addReg(SrcReg);
    return DstReg;
  }

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB =
      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
              get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
  // The scalar offset operand, if present, must also be in an SGPR.
  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
    SOff->setReg(SGPR);
  }
}

/// Rewrite register operand \p Op to a new virtual register of class
/// \p DstRC, inserting the required COPY at \p I in \p InsertMBB. Attempts to
/// fold immediate defs into the copy, and marks the copy as reading EXEC when
/// it may produce a VALU copy whose result is lane-dependent.
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  Register OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  Register DstReg = MRI.createVirtualRegister(DstRC);
  MachineInstr *Copy =
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

  Op.setReg(DstReg);
  Op.setSubReg(0);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
    FoldImmediate(*Copy, *Def, OpReg, &MRI);

  // Walk through any chain of copies to see if the ultimate source is an
  // IMPLICIT_DEF; such copies don't need the implicit EXEC read below.
  bool ImpDef = Def->isImplicitDef();
  while (!ImpDef && Def && Def->isCopy()) {
    if (Def->getOperand(1).getReg().isPhysical())
      break;
    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
    ImpDef = Def && Def->isImplicitDef();
  }
  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
      !ImpDef)
    Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
}

// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p Rsrc across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                          MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                          const DebugLoc &DL, MachineOperand &Rsrc) {
  MachineFunction &MF = *OrigBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Pick the wave-size-appropriate EXEC register and mask opcodes.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned SaveExecOpc =
      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  unsigned XorTermOpc =
      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  unsigned AndOpc =
      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  MachineBasicBlock::iterator I = LoopBB.begin();

  Register VRsrc = Rsrc.getReg();
  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  // Beginning of the loop, read the next Rsrc variant.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);

  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
      .addReg(SRsrcSub0)
      .addImm(AMDGPU::sub0)
      .addReg(SRsrcSub1)
      .addImm(AMDGPU::sub1)
      .addReg(SRsrcSub2)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcSub3)
      .addImm(AMDGPU::sub3);

  // Update Rsrc operand to use the SGPR Rsrc.
  Rsrc.setReg(SRsrc);
  Rsrc.setIsKill(true);

  // Identify all lanes with identical Rsrc operands in their VGPRs.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
      .addReg(CondReg0)
      .addReg(CondReg1);

  MRI.setSimpleHint(SaveExec, AndCond);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
      .addReg(AndCond, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = LoopBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
      .addReg(Exec)
      .addReg(SaveExec);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}

// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
// with SGPRs by iterating over all unique values across all lanes.
static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                              MachineOperand &Rsrc, MachineDominatorTree *MDT) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator I(&MI);
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);

  // Killed uses in the instruction we are waterfalling around will be
  // incorrect due to the added control-flow.
  for (auto &MO : MI.uses()) {
    if (MO.isReg() && MO.isUse()) {
      MRI.clearKillFlags(MO.getReg());
    }
  }

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, RemainderBB);

  // The loop both falls through to the remainder and branches back to itself.
  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
  MachineBasicBlock::iterator J = I++;
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  LoopBB->splice(LoopBB->begin(), &MBB, J);

  MBB.addSuccessor(LoopBB);

  // Update dominators. We know that MBB immediately dominates LoopBB, that
  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
  // dominates all of the successors transferred to it from MBB that MBB used
  // to properly dominate.
  if (MDT) {
    MDT->addNewBlock(LoopBB, &MBB);
    MDT->addNewBlock(RemainderBB, LoopBB);
    for (auto &Succ : RemainderBB->successors()) {
      if (MDT->properlyDominates(&MBB, Succ)) {
        MDT->changeImmediateDominator(Succ, RemainderBB);
      }
    }
  }

  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);

  // Restore the EXEC mask
  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
}

// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
// Returns (pointer register, new SRsrc register) as a tuple.
static std::tuple<unsigned, unsigned>
extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Extract the ptr from the resource descriptor.
  unsigned RsrcPtr =
      TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
                             AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

  // Create an empty resource descriptor
  Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();

  // Zero64 = 0
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
      .addImm(0);

  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

  // NewSRsrc = {Zero64, SRsrcFormat}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

  return std::make_tuple(RsrcPtr, NewSRsrc);
}

/// Legalize all operands of \p MI for the SI/GCN constraint rules, dispatching
/// to the per-format helpers (VOP2/VOP3/SMRD/...) and handling the generic
/// pseudo instructions (PHI, REG_SEQUENCE, INSERT_SUBREG, SI_INIT_M0) inline.
void SIInstrInfo::legalizeOperands(MachineInstr &MI,
                                   MachineDominatorTree *MDT) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !Register::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVectorRegisters(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
          VRC = &AMDGPU::VReg_1RegClass;
        } else
          VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                    ? RI.getEquivalentAGPRClass(SRC)
                    : RI.getEquivalentVGPRClass(SRC);
      } else {
        VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                  ? RI.getEquivalentAGPRClass(VRC)
                  : RI.getEquivalentVGPRClass(VRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
        continue;

      // MI is a PHI instruction.
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      // Avoid creating no-op copies with the same src and dst reg class. These
      // confuse some of the machine passes.
      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI.getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    Register Dst = MI.getOperand(0).getReg();
    Register Src0 = MI.getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock *MBB = MI.getParent();
      MachineOperand &Op = MI.getOperand(1);
      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    }
    return;
  }

  // Legalize SI_INIT_M0
  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
    MachineOperand &Src = MI.getOperand(0);
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return;
  }

  // Legalize MIMG and MUBUF/MTBUF for shaders.
  //
  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
  // scratch memory access. In both cases, the legalization never involves
  // conversion to the addr64 form.
4660 if (isMIMG(MI) || 4661 (AMDGPU::isShader(MF.getFunction().getCallingConv()) && 4662 (isMUBUF(MI) || isMTBUF(MI)))) { 4663 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 4664 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { 4665 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); 4666 SRsrc->setReg(SGPR); 4667 } 4668 4669 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 4670 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { 4671 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); 4672 SSamp->setReg(SGPR); 4673 } 4674 return; 4675 } 4676 4677 // Legalize MUBUF* instructions. 4678 int RsrcIdx = 4679 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 4680 if (RsrcIdx != -1) { 4681 // We have an MUBUF instruction 4682 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 4683 unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; 4684 if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), 4685 RI.getRegClass(RsrcRC))) { 4686 // The operands are legal. 4687 // FIXME: We may need to legalize operands besided srsrc. 4688 return; 4689 } 4690 4691 // Legalize a VGPR Rsrc. 4692 // 4693 // If the instruction is _ADDR64, we can avoid a waterfall by extracting 4694 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using 4695 // a zero-value SRsrc. 4696 // 4697 // If the instruction is _OFFSET (both idxen and offen disabled), and we 4698 // support ADDR64 instructions, we can convert to ADDR64 and do the same as 4699 // above. 4700 // 4701 // Otherwise we are on non-ADDR64 hardware, and/or we have 4702 // idxen/offen/bothen and we fall back to a waterfall loop. 
4703 4704 MachineBasicBlock &MBB = *MI.getParent(); 4705 4706 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 4707 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { 4708 // This is already an ADDR64 instruction so we need to add the pointer 4709 // extracted from the resource descriptor to the current value of VAddr. 4710 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4711 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4712 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 4713 4714 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 4715 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); 4716 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); 4717 4718 unsigned RsrcPtr, NewSRsrc; 4719 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 4720 4721 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 4722 const DebugLoc &DL = MI.getDebugLoc(); 4723 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo) 4724 .addDef(CondReg0) 4725 .addReg(RsrcPtr, 0, AMDGPU::sub0) 4726 .addReg(VAddr->getReg(), 0, AMDGPU::sub0) 4727 .addImm(0); 4728 4729 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 4730 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) 4731 .addDef(CondReg1, RegState::Dead) 4732 .addReg(RsrcPtr, 0, AMDGPU::sub1) 4733 .addReg(VAddr->getReg(), 0, AMDGPU::sub1) 4734 .addReg(CondReg0, RegState::Kill) 4735 .addImm(0); 4736 4737 // NewVaddr = {NewVaddrHi, NewVaddrLo} 4738 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 4739 .addReg(NewVAddrLo) 4740 .addImm(AMDGPU::sub0) 4741 .addReg(NewVAddrHi) 4742 .addImm(AMDGPU::sub1); 4743 4744 VAddr->setReg(NewVAddr); 4745 Rsrc->setReg(NewSRsrc); 4746 } else if (!VAddr && ST.hasAddr64()) { 4747 // This instructions is the _OFFSET variant, so we need to convert it to 4748 // ADDR64. 
      assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                .add(*VData)
                .addReg(NewVAddr)
                .addReg(NewSRsrc)
                .add(*SOffset)
                .add(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
                getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }
        if (const MachineOperand *DLC =
                getNamedOperand(MI, AMDGPU::OpName::dlc)) {
          MIB.addImm(DLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));

        MIB.cloneMemRefs(MI);
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                     .add(*VData)
                     .add(*VDataIn)
                     .addReg(NewVAddr)
                     .addReg(NewSRsrc)
                     .add(*SOffset)
                     .add(*Offset)
                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
                     .cloneMemRefs(MI);
      }

      MI.removeFromParent();

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      // Built in front of the new ADDR64 instruction that uses it.
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
              NewVAddr)
          .addReg(RsrcPtr, 0, AMDGPU::sub0)
          .addImm(AMDGPU::sub0)
          .addReg(RsrcPtr, 0, AMDGPU::sub1)
          .addImm(AMDGPU::sub1);
    } else {
      // This is another variant; legalize Rsrc with waterfall loop from VGPRs
      // to SGPRs.
      loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
    }
  }
}

/// Work-list driven replacement of \p TopInst (and any SALU users reached
/// through its results) with equivalent VALU instructions, legalizing
/// operands along the way.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
                             MachineDominatorTree *MDT) const {
  SetVectorType Worklist;
  Worklist.insert(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned NewOpcode = getVALUOp(Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO:
      splitScalar64BitAddSub(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32:
      // FIXME: The u32 versions currently selected use the carry.
      if (moveScalarAddSub(Worklist, Inst, MDT))
        continue;

      // Default handling
      break;
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B64:
      // With DL instructions a 32-bit V_XNOR exists, so split into two
      // 32-bit XNORs; otherwise expand via NOT + XOR.
      if (ST.hasDLInsts())
        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
      else
        splitScalar64BitXnor(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64:
      splitScalar64BitBFE(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    // Targets with only "reversed" VALU shifts take the shift amount in
    // src0, so swap the operands when converting.
    case AMDGPU::S_LSHL_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      if (ST.isWave32())
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
                AMDGPU::VCC_LO)
            .addReg(AMDGPU::EXEC_LO)
            .addReg(AMDGPU::VCC_LO);
      else
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
                AMDGPU::VCC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");

    case AMDGPU::S_PACK_LL_B32_B16:
    case AMDGPU::S_PACK_LH_B32_B16:
    case AMDGPU::S_PACK_HH_B32_B16:
      movePackToVALU(Worklist, MRI, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B32:
      lowerScalarXnor(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst, MDT);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst.setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it,
    // and we're just about to add the implicit use / defs of VCC, and we
    // don't want both.
    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst.getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        // Only propagate through live-def of SCC.
        if (Op.isDef() && !Op.isDead())
          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
        Inst.RemoveOperand(i);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst.addOperand(MachineOperand::CreateImm(0));
      Inst.addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst.addOperand(MachineOperand::CreateImm(0));
    }

    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    fixImplicitOperands(Inst);

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst.RemoveOperand(2);                      // Remove old immediate.
      Inst.addOperand(MachineOperand::CreateImm(Offset));
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      Register DstReg = Inst.getOperand(0).getReg();
      if (Register::isPhysicalRegister(DstReg))
        continue;

      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
      if (!NewDstRC)
        continue;

      if (Inst.isCopy() &&
          Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge. Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
        MRI.clearKillFlags(Inst.getOperand(1).getReg());
        Inst.getOperand(0).setReg(DstReg);

        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
        // these are deleted later, but at -O0 it would leave a suspicious
        // looking illegal copy of an undef register.
        for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
          Inst.RemoveOperand(I);
        Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
        continue;
      }

      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    legalizeOperands(Inst, MDT);

    if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

// Add/sub require special handling to deal with carry outs.
bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                                   MachineDominatorTree *MDT) const {
  if (ST.hasAddNoCarry()) {
    // Assume there is no user of scc since we don't select this in that case.
    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    // is used.

    MachineBasicBlock &MBB = *Inst.getParent();
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    Register OldDstReg = Inst.getOperand(0).getReg();
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    unsigned Opc = Inst.getOpcode();
    assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);

    unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;

    // The SALU form defines SCC (operand 3); the no-carry VALU form has no
    // carry-out, so drop it before switching the descriptor.
    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    Inst.RemoveOperand(3);

    Inst.setDesc(get(NewOpc));
    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
    Inst.addImplicitDefUseOperands(*MBB.getParent());
    MRI.replaceRegWith(OldDstReg, ResultReg);
    legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return true;
  }

  return false;
}

// Lower S_ABS_I32 to the VALU as max(x, 0 - x).
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                 MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  unsigned SubOp = ST.hasAddNoCarry() ?
    AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  // ResultReg = max(Src, TmpReg)
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Lower S_XNOR_B32 to the VALU: directly with V_XNOR when available,
// otherwise as a NOT of one source followed by an XOR.
void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
      .add(Src0)
      .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
        .addReg(Temp)
        .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
        .add(Src0)
        .addReg(Temp);
    } else {
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
        .add(Src0)
        .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}

// Expand S_NAND_B32 / S_NOR_B32: emit the plain binop (\p Opcode) into a
// temporary, then S_NOT the result; both are queued for VALU lowering.
void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
                                      MachineInstr &Inst,
                                      unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
    .add(Src0)
    .add(Src1);

  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
    .addReg(Interm);

  Worklist.insert(&Op);
  Worklist.insert(&Not);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

// Expand S_ANDN2_B32 / S_ORN2_B32: S_NOT the second source into a temporary,
// then apply the plain binop (\p Opcode); both are queued for VALU lowering.
void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
                                     MachineInstr &Inst,
                                     unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
    .add(Src1);

  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
    .add(Src0)
    .addReg(Interm);

  Worklist.insert(&Not);
  Worklist.insert(&Op);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

// Split a 64-bit scalar unary op into two 32-bit VALU ops (\p Opcode), one
// per 32-bit half, recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

// Expand S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO into a 32-bit add/sub with
// carry-out plus a 32-bit add/sub with carry-in on the VALU.
void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
                                         MachineInstr &Inst,
                                         MachineDominatorTree *MDT) const {
  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register CarryReg = MRI.createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
  MachineInstr *LoHalf =
    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    .addReg(CarryReg, RegState::Define)
    .add(SrcReg0Sub0)
    .add(SrcReg1Sub0)
    .addImm(0); // clamp bit

  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  MachineInstr *HiHalf =
    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .add(SrcReg0Sub1)
    .add(SrcReg1Sub1)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

// Split a 64-bit scalar binary op into two 32-bit ops (\p Opcode), one per
// 32-bit half of each source, recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

// Expand S_XNOR_B64 on targets without V_XNOR: S_NOT_B64 one source (prefer
// a source already in SGPRs), then S_XOR_B64; the XOR is queued for further
// VALU lowering.
void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
                                       MachineInstr &Inst,
                                       MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());

  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  MachineOperand* Op0;
  MachineOperand* Op1;

  // Keep the NOT on the scalar unit when possible: invert the SGPR source.
  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
    Op0 = &Src0;
    Op1 = &Src1;
  } else {
    Op0 = &Src1;
    Op1 = &Src0;
  }

  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
    .add(*Op0);

  Register NewDest = MRI.createVirtualRegister(DestRC);

  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
    .addReg(Interm)
    .add(*Op1);

  MRI.replaceRegWith(Dest.getReg(), NewDest);

  Worklist.insert(&Xor);
}

// Expand S_BCNT1_I32_B64 into two V_BCNT_U32_B32 ops, accumulating the
// second popcount on top of the first.
void SIInstrInfo::splitScalar64BitBCNT(
    SetVectorType &Worklist, MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
5543 MRI.getRegClass(Src.getReg()) : 5544 &AMDGPU::SGPR_32RegClass; 5545 5546 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5547 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5548 5549 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 5550 5551 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 5552 AMDGPU::sub0, SrcSubRC); 5553 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 5554 AMDGPU::sub1, SrcSubRC); 5555 5556 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 5557 5558 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 5559 5560 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5561 5562 // We don't need to legalize operands here. src0 for etiher instruction can be 5563 // an SGPR, and the second input is unused or determined here. 5564 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5565 } 5566 5567 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, 5568 MachineInstr &Inst) const { 5569 MachineBasicBlock &MBB = *Inst.getParent(); 5570 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5571 MachineBasicBlock::iterator MII = Inst; 5572 const DebugLoc &DL = Inst.getDebugLoc(); 5573 5574 MachineOperand &Dest = Inst.getOperand(0); 5575 uint32_t Imm = Inst.getOperand(2).getImm(); 5576 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 5577 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 5578 5579 (void) Offset; 5580 5581 // Only sext_inreg cases handled. 
5582 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 5583 Offset == 0 && "Not implemented"); 5584 5585 if (BitWidth < 32) { 5586 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5587 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5588 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5589 5590 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 5591 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 5592 .addImm(0) 5593 .addImm(BitWidth); 5594 5595 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 5596 .addImm(31) 5597 .addReg(MidRegLo); 5598 5599 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 5600 .addReg(MidRegLo) 5601 .addImm(AMDGPU::sub0) 5602 .addReg(MidRegHi) 5603 .addImm(AMDGPU::sub1); 5604 5605 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5606 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5607 return; 5608 } 5609 5610 MachineOperand &Src = Inst.getOperand(1); 5611 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5612 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5613 5614 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 5615 .addImm(31) 5616 .addReg(Src.getReg(), 0, AMDGPU::sub0); 5617 5618 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 5619 .addReg(Src.getReg(), 0, AMDGPU::sub0) 5620 .addImm(AMDGPU::sub0) 5621 .addReg(TmpReg) 5622 .addImm(AMDGPU::sub1); 5623 5624 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5625 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5626 } 5627 5628 void SIInstrInfo::addUsersToMoveToVALUWorklist( 5629 unsigned DstReg, 5630 MachineRegisterInfo &MRI, 5631 SetVectorType &Worklist) const { 5632 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 5633 E = MRI.use_end(); I != E;) { 5634 MachineInstr &UseMI = *I->getParent(); 5635 5636 unsigned OpNo = 0; 5637 5638 switch (UseMI.getOpcode()) { 
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::WWM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
    case AMDGPU::INSERT_SUBREG:
      // Register pass-through style operations: keep OpNo == 0, i.e. decide
      // based on the def operand's register class.
      break;
    default:
      // Anything else: look at the register class required at the actual use
      // operand.
      OpNo = I.getOperandNo();
      break;
    }

    if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
      // The user still expects a scalar register class: queue it for VALU
      // conversion. Then skip the remaining uses inside this same
      // instruction -- it only needs to be queued once.
      Worklist.insert(&UseMI);

      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

// Lower an SALU S_PACK_* instruction into an equivalent VALU sequence that
// writes a fresh VGPR. The destination register is replaced with the new
// result and all users of the result are queued for VALU conversion.
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    // Mask src0 to its low 16 bits, then OR in src1 shifted left by 16.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // Bitfield-insert src0's low half into src1 using a 0xffff mask.
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // Shift src0 right by 16, mask src1 to its high half, then combine.
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Queue every instruction that uses the (still live) SCC defined by
// \p SCCDefInst, up to but not including the next redefinition of SCC.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SetVectorType &Worklist) const {
  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
5734 for (MachineInstr &MI : // Skip the def inst itself. 5735 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), 5736 SCCDefInst.getParent()->end())) { 5737 // Check if SCC is used first. 5738 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) 5739 Worklist.insert(&MI); 5740 // Exit if we find another SCC def. 5741 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 5742 return; 5743 } 5744 } 5745 5746 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 5747 const MachineInstr &Inst) const { 5748 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 5749 5750 switch (Inst.getOpcode()) { 5751 // For target instructions, getOpRegClass just returns the virtual register 5752 // class associated with the operand, so we need to find an equivalent VGPR 5753 // register class in order to move the instruction to the VALU. 5754 case AMDGPU::COPY: 5755 case AMDGPU::PHI: 5756 case AMDGPU::REG_SEQUENCE: 5757 case AMDGPU::INSERT_SUBREG: 5758 case AMDGPU::WQM: 5759 case AMDGPU::SOFT_WQM: 5760 case AMDGPU::WWM: { 5761 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 5762 if (RI.hasAGPRs(SrcRC)) { 5763 if (RI.hasAGPRs(NewDstRC)) 5764 return nullptr; 5765 5766 switch (Inst.getOpcode()) { 5767 case AMDGPU::PHI: 5768 case AMDGPU::REG_SEQUENCE: 5769 case AMDGPU::INSERT_SUBREG: 5770 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 5771 break; 5772 default: 5773 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 5774 } 5775 5776 if (!NewDstRC) 5777 return nullptr; 5778 } else { 5779 if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 5780 return nullptr; 5781 5782 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 5783 if (!NewDstRC) 5784 return nullptr; 5785 } 5786 5787 return NewDstRC; 5788 } 5789 default: 5790 return NewDstRC; 5791 } 5792 } 5793 5794 // Find the one SGPR operand we are allowed to use. 
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    Register Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

// Return a pointer to the named operand of \p MI, or nullptr if the opcode
// has no operand with that name.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

// Build the default buffer-resource DATA_FORMAT dword for this subtarget
// generation.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60); // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

// Build words 2-3 of the scratch buffer resource descriptor.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32.
  uint64_t IndexStride = ST.getWavefrontSize() == 64 ?
3 : 2; 5908 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; 5909 5910 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 5911 // Clear them unless we want a huge stride. 5912 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 5913 ST.getGeneration() <= AMDGPUSubtarget::GFX9) 5914 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 5915 5916 return Rsrc23; 5917 } 5918 5919 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 5920 unsigned Opc = MI.getOpcode(); 5921 5922 return isSMRD(Opc); 5923 } 5924 5925 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { 5926 unsigned Opc = MI.getOpcode(); 5927 5928 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 5929 } 5930 5931 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 5932 int &FrameIndex) const { 5933 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 5934 if (!Addr || !Addr->isFI()) 5935 return AMDGPU::NoRegister; 5936 5937 assert(!MI.memoperands_empty() && 5938 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 5939 5940 FrameIndex = Addr->getIndex(); 5941 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 5942 } 5943 5944 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 5945 int &FrameIndex) const { 5946 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 5947 assert(Addr && Addr->isFI()); 5948 FrameIndex = Addr->getIndex(); 5949 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 5950 } 5951 5952 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 5953 int &FrameIndex) const { 5954 if (!MI.mayLoad()) 5955 return AMDGPU::NoRegister; 5956 5957 if (isMUBUF(MI) || isVGPRSpill(MI)) 5958 return isStackAccess(MI, FrameIndex); 5959 5960 if (isSGPRSpill(MI)) 5961 return isSGPRStackAccess(MI, FrameIndex); 5962 5963 return AMDGPU::NoRegister; 5964 } 5965 5966 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 5967 int 
&FrameIndex) const { 5968 if (!MI.mayStore()) 5969 return AMDGPU::NoRegister; 5970 5971 if (isMUBUF(MI) || isVGPRSpill(MI)) 5972 return isStackAccess(MI, FrameIndex); 5973 5974 if (isSGPRSpill(MI)) 5975 return isSGPRStackAccess(MI, FrameIndex); 5976 5977 return AMDGPU::NoRegister; 5978 } 5979 5980 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 5981 unsigned Size = 0; 5982 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 5983 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 5984 while (++I != E && I->isInsideBundle()) { 5985 assert(!I->isBundle() && "No nested bundle!"); 5986 Size += getInstSizeInBytes(*I); 5987 } 5988 5989 return Size; 5990 } 5991 5992 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 5993 unsigned Opc = MI.getOpcode(); 5994 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 5995 unsigned DescSize = Desc.getSize(); 5996 5997 // If we have a definitive size, we can use it. Otherwise we need to inspect 5998 // the operands to know the size. 5999 if (isFixedSize(MI)) 6000 return DescSize; 6001 6002 // 4-byte instructions may have a 32-bit literal encoded after them. Check 6003 // operands that coud ever be literals. 6004 if (isVALU(MI) || isSALU(MI)) { 6005 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 6006 if (Src0Idx == -1) 6007 return DescSize; // No operands. 6008 6009 if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) 6010 return isVOP3(MI) ? 12 : (DescSize + 4); 6011 6012 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 6013 if (Src1Idx == -1) 6014 return DescSize; 6015 6016 if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) 6017 return isVOP3(MI) ? 
12 : (DescSize + 4); 6018 6019 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 6020 if (Src2Idx == -1) 6021 return DescSize; 6022 6023 if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) 6024 return isVOP3(MI) ? 12 : (DescSize + 4); 6025 6026 return DescSize; 6027 } 6028 6029 // Check whether we have extra NSA words. 6030 if (isMIMG(MI)) { 6031 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 6032 if (VAddr0Idx < 0) 6033 return 8; 6034 6035 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 6036 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); 6037 } 6038 6039 switch (Opc) { 6040 case TargetOpcode::IMPLICIT_DEF: 6041 case TargetOpcode::KILL: 6042 case TargetOpcode::DBG_VALUE: 6043 case TargetOpcode::EH_LABEL: 6044 return 0; 6045 case TargetOpcode::BUNDLE: 6046 return getInstBundleSize(MI); 6047 case TargetOpcode::INLINEASM: 6048 case TargetOpcode::INLINEASM_BR: { 6049 const MachineFunction *MF = MI.getParent()->getParent(); 6050 const char *AsmStr = MI.getOperand(0).getSymbolName(); 6051 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), 6052 &MF->getSubtarget()); 6053 } 6054 default: 6055 return DescSize; 6056 } 6057 } 6058 6059 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 6060 if (!isFLAT(MI)) 6061 return false; 6062 6063 if (MI.memoperands_empty()) 6064 return true; 6065 6066 for (const MachineMemOperand *MMO : MI.memoperands()) { 6067 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 6068 return true; 6069 } 6070 return false; 6071 } 6072 6073 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 6074 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 6075 } 6076 6077 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 6078 MachineBasicBlock *IfEnd) const { 6079 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 6080 assert(TI != IfEntry->end()); 6081 6082 
MachineInstr *Branch = &(*TI); 6083 MachineFunction *MF = IfEntry->getParent(); 6084 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 6085 6086 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 6087 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 6088 MachineInstr *SIIF = 6089 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 6090 .add(Branch->getOperand(0)) 6091 .add(Branch->getOperand(1)); 6092 MachineInstr *SIEND = 6093 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 6094 .addReg(DstReg); 6095 6096 IfEntry->erase(TI); 6097 IfEntry->insert(IfEntry->end(), SIIF); 6098 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 6099 } 6100 } 6101 6102 void SIInstrInfo::convertNonUniformLoopRegion( 6103 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 6104 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 6105 // We expect 2 terminators, one conditional and one unconditional. 6106 assert(TI != LoopEnd->end()); 6107 6108 MachineInstr *Branch = &(*TI); 6109 MachineFunction *MF = LoopEnd->getParent(); 6110 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 6111 6112 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 6113 6114 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 6115 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); 6116 MachineInstrBuilder HeaderPHIBuilder = 6117 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 6118 for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), 6119 E = LoopEntry->pred_end(); 6120 PI != E; ++PI) { 6121 if (*PI == LoopEnd) { 6122 HeaderPHIBuilder.addReg(BackEdgeReg); 6123 } else { 6124 MachineBasicBlock *PMBB = *PI; 6125 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); 6126 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 6127 ZeroReg, 0); 6128 HeaderPHIBuilder.addReg(ZeroReg); 6129 } 6130 HeaderPHIBuilder.addMBB(*PI); 6131 
} 6132 MachineInstr *HeaderPhi = HeaderPHIBuilder; 6133 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 6134 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 6135 .addReg(DstReg) 6136 .add(Branch->getOperand(0)); 6137 MachineInstr *SILOOP = 6138 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 6139 .addReg(BackEdgeReg) 6140 .addMBB(LoopEntry); 6141 6142 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 6143 LoopEnd->erase(TI); 6144 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 6145 LoopEnd->insert(LoopEnd->end(), SILOOP); 6146 } 6147 } 6148 6149 ArrayRef<std::pair<int, const char *>> 6150 SIInstrInfo::getSerializableTargetIndices() const { 6151 static const std::pair<int, const char *> TargetIndices[] = { 6152 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 6153 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 6154 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 6155 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 6156 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 6157 return makeArrayRef(TargetIndices); 6158 } 6159 6160 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 6161 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 6162 ScheduleHazardRecognizer * 6163 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 6164 const ScheduleDAG *DAG) const { 6165 return new GCNHazardRecognizer(DAG->MF); 6166 } 6167 6168 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 6169 /// pass. 
6170 ScheduleHazardRecognizer * 6171 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 6172 return new GCNHazardRecognizer(MF); 6173 } 6174 6175 std::pair<unsigned, unsigned> 6176 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 6177 return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); 6178 } 6179 6180 ArrayRef<std::pair<unsigned, const char *>> 6181 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 6182 static const std::pair<unsigned, const char *> TargetFlags[] = { 6183 { MO_GOTPCREL, "amdgpu-gotprel" }, 6184 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 6185 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 6186 { MO_REL32_LO, "amdgpu-rel32-lo" }, 6187 { MO_REL32_HI, "amdgpu-rel32-hi" }, 6188 { MO_ABS32_LO, "amdgpu-abs32-lo" }, 6189 { MO_ABS32_HI, "amdgpu-abs32-hi" }, 6190 }; 6191 6192 return makeArrayRef(TargetFlags); 6193 } 6194 6195 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 6196 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 6197 MI.modifiesRegister(AMDGPU::EXEC, &RI); 6198 } 6199 6200 MachineInstrBuilder 6201 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 6202 MachineBasicBlock::iterator I, 6203 const DebugLoc &DL, 6204 unsigned DestReg) const { 6205 if (ST.hasAddNoCarry()) 6206 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 6207 6208 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6209 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); 6210 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); 6211 6212 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) 6213 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 6214 } 6215 6216 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 6217 MachineBasicBlock::iterator I, 6218 const DebugLoc &DL, 6219 Register DestReg, 6220 RegScavenger &RS) const { 6221 if (ST.hasAddNoCarry()) 6222 return BuildMI(MBB, I, DL, 
get(AMDGPU::V_ADD_U32_e32), DestReg); 6223 6224 // If available, prefer to use vcc. 6225 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) 6226 ? Register(RI.getVCC()) 6227 : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); 6228 6229 // TODO: Users need to deal with this. 6230 if (!UnusedCarry.isValid()) 6231 return MachineInstrBuilder(); 6232 6233 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) 6234 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 6235 } 6236 6237 bool SIInstrInfo::isKillTerminator(unsigned Opcode) { 6238 switch (Opcode) { 6239 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 6240 case AMDGPU::SI_KILL_I1_TERMINATOR: 6241 return true; 6242 default: 6243 return false; 6244 } 6245 } 6246 6247 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { 6248 switch (Opcode) { 6249 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 6250 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); 6251 case AMDGPU::SI_KILL_I1_PSEUDO: 6252 return get(AMDGPU::SI_KILL_I1_TERMINATOR); 6253 default: 6254 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); 6255 } 6256 } 6257 6258 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { 6259 MachineBasicBlock *MBB = MI.getParent(); 6260 MachineFunction *MF = MBB->getParent(); 6261 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 6262 6263 if (!ST.isWave32()) 6264 return; 6265 6266 for (auto &Op : MI.implicit_operands()) { 6267 if (Op.isReg() && Op.getReg() == AMDGPU::VCC) 6268 Op.setReg(AMDGPU::VCC_LO); 6269 } 6270 } 6271 6272 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { 6273 if (!isSMRD(MI)) 6274 return false; 6275 6276 // Check that it is using a buffer resource. 6277 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); 6278 if (Idx == -1) // e.g. 
s_memtime 6279 return false; 6280 6281 const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; 6282 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); 6283 } 6284 6285 unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace, 6286 bool Signed) const { 6287 if (!ST.hasFlatInstOffsets()) 6288 return 0; 6289 6290 if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) 6291 return 0; 6292 6293 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) 6294 return Signed ? 12 : 11; 6295 6296 return Signed ? 13 : 12; 6297 } 6298 6299 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, 6300 bool Signed) const { 6301 // TODO: Should 0 be special cased? 6302 if (!ST.hasFlatInstOffsets()) 6303 return false; 6304 6305 if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) 6306 return false; 6307 6308 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 6309 return (Signed && isInt<12>(Offset)) || 6310 (!Signed && isUInt<11>(Offset)); 6311 } 6312 6313 return (Signed && isInt<13>(Offset)) || 6314 (!Signed && isUInt<12>(Offset)); 6315 } 6316 6317 6318 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td 6319 enum SIEncodingFamily { 6320 SI = 0, 6321 VI = 1, 6322 SDWA = 2, 6323 SDWA9 = 3, 6324 GFX80 = 4, 6325 GFX9 = 5, 6326 GFX10 = 6, 6327 SDWA10 = 7 6328 }; 6329 6330 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { 6331 switch (ST.getGeneration()) { 6332 default: 6333 break; 6334 case AMDGPUSubtarget::SOUTHERN_ISLANDS: 6335 case AMDGPUSubtarget::SEA_ISLANDS: 6336 return SIEncodingFamily::SI; 6337 case AMDGPUSubtarget::VOLCANIC_ISLANDS: 6338 case AMDGPUSubtarget::GFX9: 6339 return SIEncodingFamily::VI; 6340 case AMDGPUSubtarget::GFX10: 6341 return SIEncodingFamily::GFX10; 6342 } 6343 llvm_unreachable("Unknown subtarget generation!"); 6344 } 6345 6346 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { 6347 switch(MCOp) { 6348 // These opcodes use indirect register 
addressing so 6349 // they need special handling by codegen (currently missing). 6350 // Therefore it is too risky to allow these opcodes 6351 // to be selected by dpp combiner or sdwa peepholer. 6352 case AMDGPU::V_MOVRELS_B32_dpp_gfx10: 6353 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: 6354 case AMDGPU::V_MOVRELD_B32_dpp_gfx10: 6355 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: 6356 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: 6357 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: 6358 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: 6359 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: 6360 return true; 6361 default: 6362 return false; 6363 } 6364 } 6365 6366 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { 6367 SIEncodingFamily Gen = subtargetEncodingFamily(ST); 6368 6369 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && 6370 ST.getGeneration() == AMDGPUSubtarget::GFX9) 6371 Gen = SIEncodingFamily::GFX9; 6372 6373 // Adjust the encoding family to GFX80 for D16 buffer instructions when the 6374 // subtarget has UnpackedD16VMem feature. 6375 // TODO: remove this when we discard GFX80 encoding. 6376 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) 6377 Gen = SIEncodingFamily::GFX80; 6378 6379 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { 6380 switch (ST.getGeneration()) { 6381 default: 6382 Gen = SIEncodingFamily::SDWA; 6383 break; 6384 case AMDGPUSubtarget::GFX9: 6385 Gen = SIEncodingFamily::SDWA9; 6386 break; 6387 case AMDGPUSubtarget::GFX10: 6388 Gen = SIEncodingFamily::SDWA10; 6389 break; 6390 } 6391 } 6392 6393 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); 6394 6395 // -1 means that Opcode is already a native instruction. 6396 if (MCOp == -1) 6397 return Opcode; 6398 6399 // (uint16_t)-1 means that Opcode is a pseudo instruction that has 6400 // no encoding in the given subtarget generation. 
6401 if (MCOp == (uint16_t)-1) 6402 return -1; 6403 6404 if (isAsmOnlyOpcode(MCOp)) 6405 return -1; 6406 6407 return MCOp; 6408 } 6409 6410 static 6411 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { 6412 assert(RegOpnd.isReg()); 6413 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : 6414 getRegSubRegPair(RegOpnd); 6415 } 6416 6417 TargetInstrInfo::RegSubRegPair 6418 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { 6419 assert(MI.isRegSequence()); 6420 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) 6421 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { 6422 auto &RegOp = MI.getOperand(1 + 2 * I); 6423 return getRegOrUndef(RegOp); 6424 } 6425 return TargetInstrInfo::RegSubRegPair(); 6426 } 6427 6428 // Try to find the definition of reg:subreg in subreg-manipulation pseudos 6429 // Following a subreg of reg:subreg isn't supported 6430 static bool followSubRegDef(MachineInstr &MI, 6431 TargetInstrInfo::RegSubRegPair &RSR) { 6432 if (!RSR.SubReg) 6433 return false; 6434 switch (MI.getOpcode()) { 6435 default: break; 6436 case AMDGPU::REG_SEQUENCE: 6437 RSR = getRegSequenceSubReg(MI, RSR.SubReg); 6438 return true; 6439 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg 6440 case AMDGPU::INSERT_SUBREG: 6441 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) 6442 // inserted the subreg we're looking for 6443 RSR = getRegOrUndef(MI.getOperand(2)); 6444 else { // the subreg in the rest of the reg 6445 auto R1 = getRegOrUndef(MI.getOperand(1)); 6446 if (R1.SubReg) // subreg of subreg isn't supported 6447 return false; 6448 RSR.Reg = R1.Reg; 6449 } 6450 return true; 6451 } 6452 return false; 6453 } 6454 6455 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, 6456 MachineRegisterInfo &MRI) { 6457 assert(MRI.isSSA()); 6458 if (!Register::isVirtualRegister(P.Reg)) 6459 return nullptr; 6460 6461 auto RSR = P; 6462 auto *DefInst = 
MRI.getVRegDef(RSR.Reg); 6463 while (auto *MI = DefInst) { 6464 DefInst = nullptr; 6465 switch (MI->getOpcode()) { 6466 case AMDGPU::COPY: 6467 case AMDGPU::V_MOV_B32_e32: { 6468 auto &Op1 = MI->getOperand(1); 6469 if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) { 6470 if (Op1.isUndef()) 6471 return nullptr; 6472 RSR = getRegSubRegPair(Op1); 6473 DefInst = MRI.getVRegDef(RSR.Reg); 6474 } 6475 break; 6476 } 6477 default: 6478 if (followSubRegDef(*MI, RSR)) { 6479 if (!RSR.Reg) 6480 return nullptr; 6481 DefInst = MRI.getVRegDef(RSR.Reg); 6482 } 6483 } 6484 if (!DefInst) 6485 return MI; 6486 } 6487 return nullptr; 6488 } 6489 6490 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, 6491 Register VReg, 6492 const MachineInstr &DefMI, 6493 const MachineInstr &UseMI) { 6494 assert(MRI.isSSA() && "Must be run on SSA"); 6495 6496 auto *TRI = MRI.getTargetRegisterInfo(); 6497 auto *DefBB = DefMI.getParent(); 6498 6499 // Don't bother searching between blocks, although it is possible this block 6500 // doesn't modify exec. 6501 if (UseMI.getParent() != DefBB) 6502 return true; 6503 6504 const int MaxInstScan = 20; 6505 int NumInst = 0; 6506 6507 // Stop scan at the use. 
6508 auto E = UseMI.getIterator(); 6509 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { 6510 if (I->isDebugInstr()) 6511 continue; 6512 6513 if (++NumInst > MaxInstScan) 6514 return true; 6515 6516 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 6517 return true; 6518 } 6519 6520 return false; 6521 } 6522 6523 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, 6524 Register VReg, 6525 const MachineInstr &DefMI) { 6526 assert(MRI.isSSA() && "Must be run on SSA"); 6527 6528 auto *TRI = MRI.getTargetRegisterInfo(); 6529 auto *DefBB = DefMI.getParent(); 6530 6531 const int MaxUseInstScan = 10; 6532 int NumUseInst = 0; 6533 6534 for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { 6535 // Don't bother searching between blocks, although it is possible this block 6536 // doesn't modify exec. 6537 if (UseInst.getParent() != DefBB) 6538 return true; 6539 6540 if (++NumUseInst > MaxUseInstScan) 6541 return true; 6542 } 6543 6544 const int MaxInstScan = 20; 6545 int NumInst = 0; 6546 6547 // Stop scan when we have seen all the uses. 
6548 for (auto I = std::next(DefMI.getIterator()); ; ++I) { 6549 if (I->isDebugInstr()) 6550 continue; 6551 6552 if (++NumInst > MaxInstScan) 6553 return true; 6554 6555 if (I->readsRegister(VReg)) 6556 if (--NumUseInst == 0) 6557 return false; 6558 6559 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 6560 return true; 6561 } 6562 } 6563 6564 MachineInstr *SIInstrInfo::createPHIDestinationCopy( 6565 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, 6566 const DebugLoc &DL, Register Src, Register Dst) const { 6567 auto Cur = MBB.begin(); 6568 if (Cur != MBB.end()) 6569 do { 6570 if (!Cur->isPHI() && Cur->readsRegister(Dst)) 6571 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); 6572 ++Cur; 6573 } while (Cur != MBB.end() && Cur != LastPHIIt); 6574 6575 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, 6576 Dst); 6577 } 6578 6579 MachineInstr *SIInstrInfo::createPHISourceCopy( 6580 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, 6581 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { 6582 if (InsPt != MBB.end() && 6583 (InsPt->getOpcode() == AMDGPU::SI_IF || 6584 InsPt->getOpcode() == AMDGPU::SI_ELSE || 6585 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && 6586 InsPt->definesRegister(Src)) { 6587 InsPt++; 6588 return BuildMI(MBB, InsPt, DL, 6589 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term 6590 : AMDGPU::S_MOV_B64_term), 6591 Dst) 6592 .addReg(Src, 0, SrcSubReg) 6593 .addReg(AMDGPU::EXEC, RegState::Implicit); 6594 } 6595 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, 6596 Dst); 6597 } 6598 6599 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } 6600 6601 MachineInstr *SIInstrInfo::foldMemoryOperandImpl( 6602 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 6603 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, 6604 VirtRegMap *VRM) const { 6605 // This is a bit of a hack (copied from AArch64). 
Consider this instruction: 6606 // 6607 // %0:sreg_32 = COPY $m0 6608 // 6609 // We explicitly chose SReg_32 for the virtual register so such a copy might 6610 // be eliminated by RegisterCoalescer. However, that may not be possible, and 6611 // %0 may even spill. We can't spill $m0 normally (it would require copying to 6612 // a numbered SGPR anyway), and since it is in the SReg_32 register class, 6613 // TargetInstrInfo::foldMemoryOperand() is going to try. 6614 // 6615 // To prevent that, constrain the %0 register class here. 6616 if (MI.isFullCopy()) { 6617 Register DstReg = MI.getOperand(0).getReg(); 6618 Register SrcReg = MI.getOperand(1).getReg(); 6619 6620 if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) { 6621 MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 6622 return nullptr; 6623 } 6624 6625 if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) { 6626 MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); 6627 return nullptr; 6628 } 6629 } 6630 6631 return nullptr; 6632 } 6633 6634 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, 6635 const MachineInstr &MI, 6636 unsigned *PredCost) const { 6637 if (MI.isBundle()) { 6638 MachineBasicBlock::const_instr_iterator I(MI.getIterator()); 6639 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); 6640 unsigned Lat = 0, Count = 0; 6641 for (++I; I != E && I->isBundledWithPred(); ++I) { 6642 ++Count; 6643 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); 6644 } 6645 return Lat + Count - 1; 6646 } 6647 6648 return SchedModel.computeInstrLatency(&MI); 6649 } 6650