//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the exec check.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
195 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 196 return false; 197 198 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 199 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 200 return true; 201 } 202 203 return false; 204 } 205 206 static bool isStride64(unsigned Opc) { 207 switch (Opc) { 208 case AMDGPU::DS_READ2ST64_B32: 209 case AMDGPU::DS_READ2ST64_B64: 210 case AMDGPU::DS_WRITE2ST64_B32: 211 case AMDGPU::DS_WRITE2ST64_B64: 212 return true; 213 default: 214 return false; 215 } 216 } 217 218 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 219 int64_t &Offset, 220 const TargetRegisterInfo *TRI) const { 221 unsigned Opc = LdSt.getOpcode(); 222 223 if (isDS(LdSt)) { 224 const MachineOperand *OffsetImm = 225 getNamedOperand(LdSt, AMDGPU::OpName::offset); 226 if (OffsetImm) { 227 // Normal, single offset LDS instruction. 228 const MachineOperand *AddrReg = 229 getNamedOperand(LdSt, AMDGPU::OpName::addr); 230 231 BaseReg = AddrReg->getReg(); 232 Offset = OffsetImm->getImm(); 233 return true; 234 } 235 236 // The 2 offset instructions use offset0 and offset1 instead. We can treat 237 // these as a load with a single offset if the 2 offsets are consecutive. We 238 // will use this for some partially aligned loads. 239 const MachineOperand *Offset0Imm = 240 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 241 const MachineOperand *Offset1Imm = 242 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 243 244 uint8_t Offset0 = Offset0Imm->getImm(); 245 uint8_t Offset1 = Offset1Imm->getImm(); 246 247 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 248 // Each of these offsets is in element sized units, so we need to convert 249 // to bytes of the individual reads. 250 251 unsigned EltSize; 252 if (LdSt.mayLoad()) 253 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 254 else { 255 assert(LdSt.mayStore()); 256 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 257 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 258 } 259 260 if (isStride64(Opc)) 261 EltSize *= 64; 262 263 const MachineOperand *AddrReg = 264 getNamedOperand(LdSt, AMDGPU::OpName::addr); 265 BaseReg = AddrReg->getReg(); 266 Offset = EltSize * Offset0; 267 return true; 268 } 269 270 return false; 271 } 272 273 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 274 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 275 if (SOffset && SOffset->isReg()) 276 return false; 277 278 const MachineOperand *AddrReg = 279 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 280 if (!AddrReg) 281 return false; 282 283 const MachineOperand *OffsetImm = 284 getNamedOperand(LdSt, AMDGPU::OpName::offset); 285 BaseReg = AddrReg->getReg(); 286 Offset = OffsetImm->getImm(); 287 288 if (SOffset) // soffset can be an inline immediate. 
289 Offset += SOffset->getImm(); 290 291 return true; 292 } 293 294 if (isSMRD(LdSt)) { 295 const MachineOperand *OffsetImm = 296 getNamedOperand(LdSt, AMDGPU::OpName::offset); 297 if (!OffsetImm) 298 return false; 299 300 const MachineOperand *SBaseReg = 301 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 302 BaseReg = SBaseReg->getReg(); 303 Offset = OffsetImm->getImm(); 304 return true; 305 } 306 307 if (isFLAT(LdSt)) { 308 const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 309 BaseReg = AddrReg->getReg(); 310 Offset = 0; 311 return true; 312 } 313 314 return false; 315 } 316 317 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 318 MachineInstr &SecondLdSt, 319 unsigned NumLoads) const { 320 const MachineOperand *FirstDst = nullptr; 321 const MachineOperand *SecondDst = nullptr; 322 323 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 324 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 325 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 326 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 327 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 328 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 329 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 330 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 331 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 332 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 333 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 334 } 335 336 if (!FirstDst || !SecondDst) 337 return false; 338 339 // Try to limit clustering based on the total number of bytes loaded 340 // rather than the number of instructions. This is done to help reduce 341 // register pressure. The method used is somewhat inexact, though, 342 // because it assumes that all loads in the cluster will load the 343 // same number of bytes as FirstLdSt. 344 345 // The unit of this value is bytes. 346 // FIXME: This needs finer tuning. 
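  // As a concrete illustration of the check below: with the 16 byte threshold,
  // two 8 byte (64-bit) loads may be clustered (2 * 8 = 16), but a third load
  // of the same width would push the estimate to 24 bytes and stop clustering.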
347 unsigned LoadClusterThreshold = 16; 348 349 const MachineRegisterInfo &MRI = 350 FirstLdSt.getParent()->getParent()->getRegInfo(); 351 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 352 353 return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; 354 } 355 356 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 357 MachineBasicBlock::iterator MI, 358 const DebugLoc &DL, unsigned DestReg, 359 unsigned SrcReg, bool KillSrc) { 360 MachineFunction *MF = MBB.getParent(); 361 DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), 362 "illegal SGPR to VGPR copy", 363 DL, DS_Error); 364 LLVMContext &C = MF->getFunction()->getContext(); 365 C.diagnose(IllegalCopy); 366 367 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 368 .addReg(SrcReg, getKillRegState(KillSrc)); 369 } 370 371 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 372 MachineBasicBlock::iterator MI, 373 const DebugLoc &DL, unsigned DestReg, 374 unsigned SrcReg, bool KillSrc) const { 375 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 376 377 if (RC == &AMDGPU::VGPR_32RegClass) { 378 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 379 AMDGPU::SReg_32RegClass.contains(SrcReg)); 380 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 381 .addReg(SrcReg, getKillRegState(KillSrc)); 382 return; 383 } 384 385 if (RC == &AMDGPU::SReg_32_XM0RegClass || 386 RC == &AMDGPU::SReg_32RegClass) { 387 if (SrcReg == AMDGPU::SCC) { 388 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 389 .addImm(-1) 390 .addImm(0); 391 return; 392 } 393 394 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 395 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 396 return; 397 } 398 399 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 400 .addReg(SrcReg, getKillRegState(KillSrc)); 401 return; 402 } 403 404 if (RC == &AMDGPU::SReg_64RegClass) { 405 if (DestReg == AMDGPU::VCC) { 406 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 407 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 408 .addReg(SrcReg, getKillRegState(KillSrc)); 409 } else { 410 // FIXME: Hack until VReg_1 removed. 
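        // A VReg_1 "boolean" is currently materialized as a 32-bit VGPR holding
        // a per-lane value, so the copy into VCC is done by comparing that VGPR
        // against zero; V_CMP_NE_U32_e32 writes its result into VCC implicitly.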
411 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 412 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 413 .addImm(0) 414 .addReg(SrcReg, getKillRegState(KillSrc)); 415 } 416 417 return; 418 } 419 420 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 421 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 422 return; 423 } 424 425 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 426 .addReg(SrcReg, getKillRegState(KillSrc)); 427 return; 428 } 429 430 if (DestReg == AMDGPU::SCC) { 431 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 432 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 433 .addReg(SrcReg, getKillRegState(KillSrc)) 434 .addImm(0); 435 return; 436 } 437 438 unsigned EltSize = 4; 439 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 440 if (RI.isSGPRClass(RC)) { 441 if (RI.getRegSizeInBits(*RC) > 32) { 442 Opcode = AMDGPU::S_MOV_B64; 443 EltSize = 8; 444 } else { 445 Opcode = AMDGPU::S_MOV_B32; 446 EltSize = 4; 447 } 448 449 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 450 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 451 return; 452 } 453 } 454 455 456 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 457 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 458 459 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 460 unsigned SubIdx; 461 if (Forward) 462 SubIdx = SubIndices[Idx]; 463 else 464 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 465 466 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 467 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 468 469 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 470 471 if (Idx == SubIndices.size() - 1) 472 Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 473 474 if (Idx == 0) 475 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 476 477 Builder.addReg(SrcReg, RegState::Implicit); 478 } 479 } 480 481 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 482 int NewOpc; 483 484 // Try to map original to commuted opcode 485 NewOpc = AMDGPU::getCommuteRev(Opcode); 486 if (NewOpc != -1) 487 // Check if the commuted (REV) opcode exists on the target. 488 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 489 490 // Try to map commuted to original opcode 491 NewOpc = AMDGPU::getCommuteOrig(Opcode); 492 if (NewOpc != -1) 493 // Check if the original (non-REV) opcode exists on the target. 494 return pseudoToMCOpcode(NewOpc) != -1 ? 
NewOpc : -1; 495 496 return Opcode; 497 } 498 499 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 500 MachineBasicBlock::iterator MI, 501 const DebugLoc &DL, unsigned DestReg, 502 int64_t Value) const { 503 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 504 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 505 if (RegClass == &AMDGPU::SReg_32RegClass || 506 RegClass == &AMDGPU::SGPR_32RegClass || 507 RegClass == &AMDGPU::SReg_32_XM0RegClass || 508 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 509 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 510 .addImm(Value); 511 return; 512 } 513 514 if (RegClass == &AMDGPU::SReg_64RegClass || 515 RegClass == &AMDGPU::SGPR_64RegClass || 516 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 517 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 518 .addImm(Value); 519 return; 520 } 521 522 if (RegClass == &AMDGPU::VGPR_32RegClass) { 523 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 524 .addImm(Value); 525 return; 526 } 527 if (RegClass == &AMDGPU::VReg_64RegClass) { 528 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 529 .addImm(Value); 530 return; 531 } 532 533 unsigned EltSize = 4; 534 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 535 if (RI.isSGPRClass(RegClass)) { 536 if (RI.getRegSizeInBits(*RegClass) > 32) { 537 Opcode = AMDGPU::S_MOV_B64; 538 EltSize = 8; 539 } else { 540 Opcode = AMDGPU::S_MOV_B32; 541 EltSize = 4; 542 } 543 } 544 545 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 546 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 547 int64_t IdxValue = Idx == 0 ? Value : 0; 548 549 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 550 get(Opcode), RI.getSubReg(DestReg, Idx)); 551 Builder.addImm(IdxValue); 552 } 553 } 554 555 const TargetRegisterClass * 556 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 557 return &AMDGPU::VGPR_32RegClass; 558 } 559 560 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 561 MachineBasicBlock::iterator I, 562 const DebugLoc &DL, unsigned DstReg, 563 ArrayRef<MachineOperand> Cond, 564 unsigned TrueReg, 565 unsigned FalseReg) const { 566 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 567 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 568 "Not a VGPR32 reg"); 569 570 if (Cond.size() == 1) { 571 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 572 .addReg(FalseReg) 573 .addReg(TrueReg) 574 .add(Cond[0]); 575 } else if (Cond.size() == 2) { 576 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 577 switch (Cond[0].getImm()) { 578 case SIInstrInfo::SCC_TRUE: { 579 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 580 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 581 .addImm(-1) 582 .addImm(0); 583 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 584 .addReg(FalseReg) 585 .addReg(TrueReg) 586 .addReg(SReg); 587 break; 588 } 589 case SIInstrInfo::SCC_FALSE: { 590 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 591 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 592 .addImm(0) 593 .addImm(-1); 594 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 595 .addReg(FalseReg) 596 .addReg(TrueReg) 597 .addReg(SReg); 598 break; 599 } 600 case SIInstrInfo::VCCNZ: { 601 MachineOperand RegOp = Cond[1]; 602 RegOp.setImplicit(false); 603 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 604 .addReg(FalseReg) 605 .addReg(TrueReg) 606 .add(RegOp); 607 break; 608 } 609 case 
SIInstrInfo::VCCZ: { 610 MachineOperand RegOp = Cond[1]; 611 RegOp.setImplicit(false); 612 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 613 .addReg(TrueReg) 614 .addReg(FalseReg) 615 .add(RegOp); 616 break; 617 } 618 case SIInstrInfo::EXECNZ: { 619 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 620 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 621 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 622 .addImm(0); 623 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 624 .addImm(-1) 625 .addImm(0); 626 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 627 .addReg(FalseReg) 628 .addReg(TrueReg) 629 .addReg(SReg); 630 break; 631 } 632 case SIInstrInfo::EXECZ: { 633 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 634 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 635 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 636 .addImm(0); 637 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 638 .addImm(0) 639 .addImm(-1); 640 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 641 .addReg(FalseReg) 642 .addReg(TrueReg) 643 .addReg(SReg); 644 llvm_unreachable("Unhandled branch predicate EXECZ"); 645 break; 646 } 647 default: 648 llvm_unreachable("invalid branch predicate"); 649 } 650 } else { 651 llvm_unreachable("Can only handle Cond size 1 or 2"); 652 } 653 } 654 655 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 656 MachineBasicBlock::iterator I, 657 const DebugLoc &DL, 658 unsigned SrcReg, int Value) const { 659 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 660 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 661 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 662 .addImm(Value) 663 .addReg(SrcReg); 664 665 return Reg; 666 } 667 668 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, 669 MachineBasicBlock::iterator I, 670 const DebugLoc &DL, 671 unsigned SrcReg, int Value) const { 672 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 673 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 674 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 675 .addImm(Value) 676 .addReg(SrcReg); 677 678 return Reg; 679 } 680 681 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 682 683 if (RI.getRegSizeInBits(*DstRC) == 32) { 684 return RI.isSGPRClass(DstRC) ? 
           AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
    // Add the scratch resource registers as implicit uses because we may end up
    // needing them, and need to ensure that the reserved registers are
    // correctly handled.

    if (ST.hasScalarStores()) {
      // m0 is used for offset to scalar stores if used to spill.
775 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 776 } 777 778 return; 779 } 780 781 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 782 LLVMContext &Ctx = MF->getFunction()->getContext(); 783 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 784 " spill register"); 785 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 786 .addReg(SrcReg); 787 788 return; 789 } 790 791 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 792 793 unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); 794 MFI->setHasSpilledVGPRs(); 795 BuildMI(MBB, MI, DL, get(Opcode)) 796 .addReg(SrcReg, getKillRegState(isKill)) // data 797 .addFrameIndex(FrameIndex) // addr 798 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 799 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 800 .addImm(0) // offset 801 .addMemOperand(MMO); 802 } 803 804 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 805 switch (Size) { 806 case 4: 807 return AMDGPU::SI_SPILL_S32_RESTORE; 808 case 8: 809 return AMDGPU::SI_SPILL_S64_RESTORE; 810 case 16: 811 return AMDGPU::SI_SPILL_S128_RESTORE; 812 case 32: 813 return AMDGPU::SI_SPILL_S256_RESTORE; 814 case 64: 815 return AMDGPU::SI_SPILL_S512_RESTORE; 816 default: 817 llvm_unreachable("unknown register size"); 818 } 819 } 820 821 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 822 switch (Size) { 823 case 4: 824 return AMDGPU::SI_SPILL_V32_RESTORE; 825 case 8: 826 return AMDGPU::SI_SPILL_V64_RESTORE; 827 case 12: 828 return AMDGPU::SI_SPILL_V96_RESTORE; 829 case 16: 830 return AMDGPU::SI_SPILL_V128_RESTORE; 831 case 32: 832 return AMDGPU::SI_SPILL_V256_RESTORE; 833 case 64: 834 return AMDGPU::SI_SPILL_V512_RESTORE; 835 default: 836 llvm_unreachable("unknown register size"); 837 } 838 } 839 840 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 841 MachineBasicBlock::iterator MI, 842 unsigned DestReg, int FrameIndex, 843 const TargetRegisterClass *RC, 844 const TargetRegisterInfo *TRI) const { 845 MachineFunction *MF = MBB.getParent(); 846 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 847 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 848 DebugLoc DL = MBB.findDebugLoc(MI); 849 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 850 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 851 unsigned SpillSize = TRI->getSpillSize(*RC); 852 853 MachinePointerInfo PtrInfo 854 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 855 856 MachineMemOperand *MMO = MF->getMachineMemOperand( 857 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 858 859 if (RI.isSGPRClass(RC)) { 860 // FIXME: Maybe this should not include a memoperand because it will be 861 // lowered to non-memory instructions. 862 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 863 if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { 864 MachineRegisterInfo &MRI = MF->getRegInfo(); 865 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 866 } 867 868 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 869 .addFrameIndex(FrameIndex) // addr 870 .addMemOperand(MMO) 871 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 872 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 873 874 if (ST.hasScalarStores()) { 875 // m0 is used for offset to scalar stores if used to spill. 
876 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 877 } 878 879 return; 880 } 881 882 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 883 LLVMContext &Ctx = MF->getFunction()->getContext(); 884 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 885 " restore register"); 886 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 887 888 return; 889 } 890 891 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 892 893 unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); 894 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 895 .addFrameIndex(FrameIndex) // vaddr 896 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 897 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 898 .addImm(0) // offset 899 .addMemOperand(MMO); 900 } 901 902 /// \param @Offset Offset in bytes of the FrameIndex being spilled 903 unsigned SIInstrInfo::calculateLDSSpillAddress( 904 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 905 unsigned FrameOffset, unsigned Size) const { 906 MachineFunction *MF = MBB.getParent(); 907 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 908 const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); 909 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 910 DebugLoc DL = MBB.findDebugLoc(MI); 911 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 912 unsigned WavefrontSize = ST.getWavefrontSize(); 913 914 unsigned TIDReg = MFI->getTIDReg(); 915 if (!MFI->hasCalculatedTID()) { 916 MachineBasicBlock &Entry = MBB.getParent()->front(); 917 MachineBasicBlock::iterator Insert = Entry.front(); 918 DebugLoc DL = Insert->getDebugLoc(); 919 920 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 921 *MF); 922 if (TIDReg == AMDGPU::NoRegister) 923 return TIDReg; 924 925 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 926 WorkGroupSize > WavefrontSize) { 927 928 unsigned TIDIGXReg 929 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); 930 unsigned TIDIGYReg 931 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 932 unsigned TIDIGZReg 933 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 934 unsigned InputPtrReg = 935 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 936 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 937 if (!Entry.isLiveIn(Reg)) 938 Entry.addLiveIn(Reg); 939 } 940 941 RS->enterBasicBlock(Entry); 942 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator)
      BuildMI(MBB, MBB.end(), DebugLoc(),
              get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
1050 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1051 break; 1052 } 1053 case AMDGPU::S_XOR_B64_term: { 1054 // This is only a terminator to get the correct spill code placement during 1055 // register allocation. 1056 MI.setDesc(get(AMDGPU::S_XOR_B64)); 1057 break; 1058 } 1059 case AMDGPU::S_ANDN2_B64_term: { 1060 // This is only a terminator to get the correct spill code placement during 1061 // register allocation. 1062 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 1063 break; 1064 } 1065 case AMDGPU::V_MOV_B64_PSEUDO: { 1066 unsigned Dst = MI.getOperand(0).getReg(); 1067 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1068 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1069 1070 const MachineOperand &SrcOp = MI.getOperand(1); 1071 // FIXME: Will this work for 64-bit floating point immediates? 1072 assert(!SrcOp.isFPImm()); 1073 if (SrcOp.isImm()) { 1074 APInt Imm(64, SrcOp.getImm()); 1075 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1076 .addImm(Imm.getLoBits(32).getZExtValue()) 1077 .addReg(Dst, RegState::Implicit | RegState::Define); 1078 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1079 .addImm(Imm.getHiBits(32).getZExtValue()) 1080 .addReg(Dst, RegState::Implicit | RegState::Define); 1081 } else { 1082 assert(SrcOp.isReg()); 1083 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1084 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 1085 .addReg(Dst, RegState::Implicit | RegState::Define); 1086 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1087 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 1088 .addReg(Dst, RegState::Implicit | RegState::Define); 1089 } 1090 MI.eraseFromParent(); 1091 break; 1092 } 1093 case AMDGPU::V_MOVRELD_B32_V1: 1094 case AMDGPU::V_MOVRELD_B32_V2: 1095 case AMDGPU::V_MOVRELD_B32_V4: 1096 case AMDGPU::V_MOVRELD_B32_V8: 1097 case AMDGPU::V_MOVRELD_B32_V16: { 1098 const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); 1099 unsigned VecReg = MI.getOperand(0).getReg(); 1100 bool IsUndef = MI.getOperand(1).isUndef(); 1101 unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); 1102 assert(VecReg == MI.getOperand(1).getReg()); 1103 1104 MachineInstr *MovRel = 1105 BuildMI(MBB, MI, DL, MovRelDesc) 1106 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1107 .add(MI.getOperand(2)) 1108 .addReg(VecReg, RegState::ImplicitDefine) 1109 .addReg(VecReg, 1110 RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 1111 1112 const int ImpDefIdx = 1113 MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); 1114 const int ImpUseIdx = ImpDefIdx + 1; 1115 MovRel->tieOperands(ImpDefIdx, ImpUseIdx); 1116 1117 MI.eraseFromParent(); 1118 break; 1119 } 1120 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 1121 MachineFunction &MF = *MBB.getParent(); 1122 unsigned Reg = MI.getOperand(0).getReg(); 1123 unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 1124 unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 1125 1126 // Create a bundle so these instructions won't be re-ordered by the 1127 // post-RA scheduler. 1128 MIBundleBuilder Bundler(MBB, MI); 1129 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 1130 1131 // Add 32-bit offset from this instruction to the start of the 1132 // constant data. 
1133 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 1134 .addReg(RegLo) 1135 .add(MI.getOperand(1))); 1136 1137 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 1138 .addReg(RegHi); 1139 if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) 1140 MIB.addImm(0); 1141 else 1142 MIB.add(MI.getOperand(2)); 1143 1144 Bundler.append(MIB); 1145 llvm::finalizeBundle(MBB, Bundler.begin()); 1146 1147 MI.eraseFromParent(); 1148 break; 1149 } 1150 } 1151 return true; 1152 } 1153 1154 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 1155 MachineOperand &Src0, 1156 unsigned Src0OpName, 1157 MachineOperand &Src1, 1158 unsigned Src1OpName) const { 1159 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 1160 if (!Src0Mods) 1161 return false; 1162 1163 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 1164 assert(Src1Mods && 1165 "All commutable instructions have both src0 and src1 modifiers"); 1166 1167 int Src0ModsVal = Src0Mods->getImm(); 1168 int Src1ModsVal = Src1Mods->getImm(); 1169 1170 Src1Mods->setImm(Src0ModsVal); 1171 Src0Mods->setImm(Src1ModsVal); 1172 return true; 1173 } 1174 1175 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 1176 MachineOperand &RegOp, 1177 MachineOperand &NonRegOp) { 1178 unsigned Reg = RegOp.getReg(); 1179 unsigned SubReg = RegOp.getSubReg(); 1180 bool IsKill = RegOp.isKill(); 1181 bool IsDead = RegOp.isDead(); 1182 bool IsUndef = RegOp.isUndef(); 1183 bool IsDebug = RegOp.isDebug(); 1184 1185 if (NonRegOp.isImm()) 1186 RegOp.ChangeToImmediate(NonRegOp.getImm()); 1187 else if (NonRegOp.isFI()) 1188 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 1189 else 1190 return nullptr; 1191 1192 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 1193 NonRegOp.setSubReg(SubReg); 1194 1195 return &MI; 1196 } 1197 1198 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 1199 unsigned Src0Idx, 1200 unsigned Src1Idx) const { 1201 assert(!NewMI && "this should never be used"); 1202 1203 unsigned Opc = MI.getOpcode(); 1204 int CommutedOpcode = commuteOpcode(Opc); 1205 if (CommutedOpcode == -1) 1206 return nullptr; 1207 1208 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 1209 static_cast<int>(Src0Idx) && 1210 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 1211 static_cast<int>(Src1Idx) && 1212 "inconsistency with findCommutedOpIndices"); 1213 1214 MachineOperand &Src0 = MI.getOperand(Src0Idx); 1215 MachineOperand &Src1 = MI.getOperand(Src1Idx); 1216 1217 MachineInstr *CommutedMI = nullptr; 1218 if (Src0.isReg() && Src1.isReg()) { 1219 if (isOperandLegal(MI, Src1Idx, &Src0)) { 1220 // Be sure to copy the source modifiers to the right place. 1221 CommutedMI 1222 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 1223 } 1224 1225 } else if (Src0.isReg() && !Src1.isReg()) { 1226 // src0 should always be able to support any operand type, so no need to 1227 // check operand legality. 1228 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 1229 } else if (!Src0.isReg() && Src1.isReg()) { 1230 if (isOperandLegal(MI, Src1Idx, &Src0)) 1231 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 1232 } else { 1233 // FIXME: Found two non registers to commute. This does happen. 
1234 return nullptr; 1235 } 1236 1237 1238 if (CommutedMI) { 1239 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1240 Src1, AMDGPU::OpName::src1_modifiers); 1241 1242 CommutedMI->setDesc(get(CommutedOpcode)); 1243 } 1244 1245 return CommutedMI; 1246 } 1247 1248 // This needs to be implemented because the source modifiers may be inserted 1249 // between the true commutable operands, and the base 1250 // TargetInstrInfo::commuteInstruction uses it. 1251 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, 1252 unsigned &SrcOpIdx1) const { 1253 if (!MI.isCommutable()) 1254 return false; 1255 1256 unsigned Opc = MI.getOpcode(); 1257 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1258 if (Src0Idx == -1) 1259 return false; 1260 1261 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1262 if (Src1Idx == -1) 1263 return false; 1264 1265 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1266 } 1267 1268 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 1269 int64_t BrOffset) const { 1270 // BranchRelaxation should never have to check s_setpc_b64 because its dest 1271 // block is unanalyzable. 1272 assert(BranchOp != AMDGPU::S_SETPC_B64); 1273 1274 // Convert to dwords. 1275 BrOffset /= 4; 1276 1277 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 1278 // from the next instruction. 1279 BrOffset -= 1; 1280 1281 return isIntN(BranchOffsetBits, BrOffset); 1282 } 1283 1284 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 1285 const MachineInstr &MI) const { 1286 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 1287 // This would be a difficult analysis to perform, but can always be legal so 1288 // there's no need to analyze it. 1289 return nullptr; 1290 } 1291 1292 return MI.getOperand(0).getMBB(); 1293 } 1294 1295 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1296 MachineBasicBlock &DestBB, 1297 const DebugLoc &DL, 1298 int64_t BrOffset, 1299 RegScavenger *RS) const { 1300 assert(RS && "RegScavenger required for long branching"); 1301 assert(MBB.empty() && 1302 "new block should be inserted for expanding unconditional branch"); 1303 assert(MBB.pred_size() == 1); 1304 1305 MachineFunction *MF = MBB.getParent(); 1306 MachineRegisterInfo &MRI = MF->getRegInfo(); 1307 1308 // FIXME: Virtual register workaround for RegScavenger not working with empty 1309 // blocks. 1310 unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1311 1312 auto I = MBB.end(); 1313 1314 // We need to compute the offset relative to the instruction immediately after 1315 // s_getpc_b64. Insert pc arithmetic code before last terminator. 1316 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 1317 1318 // TODO: Handle > 32-bit block address. 1319 if (BrOffset >= 0) { 1320 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 1321 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1322 .addReg(PCReg, 0, AMDGPU::sub0) 1323 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); 1324 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 1325 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1326 .addReg(PCReg, 0, AMDGPU::sub1) 1327 .addImm(0); 1328 } else { 1329 // Backwards branch. 
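    // Mirror of the forward case above, but with a 64-bit subtract; the
    // TF_LONG_BRANCH_BACKWARD flag marks the block operand so that the offset
    // can be emitted as a positive backwards distance.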
1330 BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) 1331 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1332 .addReg(PCReg, 0, AMDGPU::sub0) 1333 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); 1334 BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) 1335 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1336 .addReg(PCReg, 0, AMDGPU::sub1) 1337 .addImm(0); 1338 } 1339 1340 // Insert the indirect branch after the other terminator. 1341 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 1342 .addReg(PCReg); 1343 1344 // FIXME: If spilling is necessary, this will fail because this scavenger has 1345 // no emergency stack slots. It is non-trivial to spill in this situation, 1346 // because the restore code needs to be specially placed after the 1347 // jump. BranchRelaxation then needs to be made aware of the newly inserted 1348 // block. 1349 // 1350 // If a spill is needed for the pc register pair, we need to insert a spill 1351 // restore block right before the destination block, and insert a short branch 1352 // into the old destination block's fallthrough predecessor. 1353 // e.g.: 1354 // 1355 // s_cbranch_scc0 skip_long_branch: 1356 // 1357 // long_branch_bb: 1358 // spill s[8:9] 1359 // s_getpc_b64 s[8:9] 1360 // s_add_u32 s8, s8, restore_bb 1361 // s_addc_u32 s9, s9, 0 1362 // s_setpc_b64 s[8:9] 1363 // 1364 // skip_long_branch: 1365 // foo; 1366 // 1367 // ..... 1368 // 1369 // dest_bb_fallthrough_predecessor: 1370 // bar; 1371 // s_branch dest_bb 1372 // 1373 // restore_bb: 1374 // restore s[8:9] 1375 // fallthrough dest_bb 1376 /// 1377 // dest_bb: 1378 // buzz; 1379 1380 RS->enterBasicBlockEnd(MBB); 1381 unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, 1382 MachineBasicBlock::iterator(GetPC), 0); 1383 MRI.replaceRegWith(PCReg, Scav); 1384 MRI.clearVirtRegs(); 1385 RS->setRegUsed(Scav); 1386 1387 return 4 + 8 + 4 + 4; 1388 } 1389 1390 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 1391 switch (Cond) { 1392 case SIInstrInfo::SCC_TRUE: 1393 return AMDGPU::S_CBRANCH_SCC1; 1394 case SIInstrInfo::SCC_FALSE: 1395 return AMDGPU::S_CBRANCH_SCC0; 1396 case SIInstrInfo::VCCNZ: 1397 return AMDGPU::S_CBRANCH_VCCNZ; 1398 case SIInstrInfo::VCCZ: 1399 return AMDGPU::S_CBRANCH_VCCZ; 1400 case SIInstrInfo::EXECNZ: 1401 return AMDGPU::S_CBRANCH_EXECNZ; 1402 case SIInstrInfo::EXECZ: 1403 return AMDGPU::S_CBRANCH_EXECZ; 1404 default: 1405 llvm_unreachable("invalid branch predicate"); 1406 } 1407 } 1408 1409 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 1410 switch (Opcode) { 1411 case AMDGPU::S_CBRANCH_SCC0: 1412 return SCC_FALSE; 1413 case AMDGPU::S_CBRANCH_SCC1: 1414 return SCC_TRUE; 1415 case AMDGPU::S_CBRANCH_VCCNZ: 1416 return VCCNZ; 1417 case AMDGPU::S_CBRANCH_VCCZ: 1418 return VCCZ; 1419 case AMDGPU::S_CBRANCH_EXECNZ: 1420 return EXECNZ; 1421 case AMDGPU::S_CBRANCH_EXECZ: 1422 return EXECZ; 1423 default: 1424 return INVALID_BR; 1425 } 1426 } 1427 1428 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 1429 MachineBasicBlock::iterator I, 1430 MachineBasicBlock *&TBB, 1431 MachineBasicBlock *&FBB, 1432 SmallVectorImpl<MachineOperand> &Cond, 1433 bool AllowModify) const { 1434 if (I->getOpcode() == AMDGPU::S_BRANCH) { 1435 // Unconditional Branch 1436 TBB = I->getOperand(0).getMBB(); 1437 return false; 1438 } 1439 1440 MachineBasicBlock *CondBB = nullptr; 1441 1442 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 1443 CondBB = I->getOperand(1).getMBB(); 1444 Cond.push_back(I->getOperand(0)); 1445 } else { 1446 
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
static void preserveCondRegFlags(MachineOperand &CondReg,
                                 const MachineOperand &OrigCond) {
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}

unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *TBB,
                                   MachineBasicBlock *FBB,
                                   ArrayRef<MachineOperand> Cond,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {

  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    if (BytesAdded)
      *BytesAdded = 4;
    return 1;
  }

  if (Cond.size() == 1 && Cond[0].isReg()) {
    BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
      .add(Cond[0])
      .addMBB(TBB);
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    MachineInstr *CondBr =
      BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);

    // Copy the flags onto the implicit condition register operand.
1575 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1576 1577 if (BytesAdded) 1578 *BytesAdded = 4; 1579 return 1; 1580 } 1581 1582 assert(TBB && FBB); 1583 1584 MachineInstr *CondBr = 1585 BuildMI(&MBB, DL, get(Opcode)) 1586 .addMBB(TBB); 1587 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1588 .addMBB(FBB); 1589 1590 MachineOperand &CondReg = CondBr->getOperand(1); 1591 CondReg.setIsUndef(Cond[1].isUndef()); 1592 CondReg.setIsKill(Cond[1].isKill()); 1593 1594 if (BytesAdded) 1595 *BytesAdded = 8; 1596 1597 return 2; 1598 } 1599 1600 bool SIInstrInfo::reverseBranchCondition( 1601 SmallVectorImpl<MachineOperand> &Cond) const { 1602 if (Cond.size() != 2) { 1603 return true; 1604 } 1605 1606 if (Cond[0].isImm()) { 1607 Cond[0].setImm(-Cond[0].getImm()); 1608 return false; 1609 } 1610 1611 return true; 1612 } 1613 1614 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1615 ArrayRef<MachineOperand> Cond, 1616 unsigned TrueReg, unsigned FalseReg, 1617 int &CondCycles, 1618 int &TrueCycles, int &FalseCycles) const { 1619 switch (Cond[0].getImm()) { 1620 case VCCNZ: 1621 case VCCZ: { 1622 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1623 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1624 assert(MRI.getRegClass(FalseReg) == RC); 1625 1626 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1627 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1628 1629 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1630 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1631 } 1632 case SCC_TRUE: 1633 case SCC_FALSE: { 1634 // FIXME: We could insert for VGPRs if we could replace the original compare 1635 // with a vector one. 1636 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1637 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1638 assert(MRI.getRegClass(FalseReg) == RC); 1639 1640 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1641 1642 // Multiples of 8 can do s_cselect_b64 1643 if (NumInsts % 2 == 0) 1644 NumInsts /= 2; 1645 1646 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1647 return RI.isSGPRClass(RC); 1648 } 1649 default: 1650 return false; 1651 } 1652 } 1653 1654 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 1655 MachineBasicBlock::iterator I, const DebugLoc &DL, 1656 unsigned DstReg, ArrayRef<MachineOperand> Cond, 1657 unsigned TrueReg, unsigned FalseReg) const { 1658 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 1659 if (Pred == VCCZ || Pred == SCC_FALSE) { 1660 Pred = static_cast<BranchPredicate>(-Pred); 1661 std::swap(TrueReg, FalseReg); 1662 } 1663 1664 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1665 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 1666 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 1667 1668 if (DstSize == 32) { 1669 unsigned SelOp = Pred == SCC_TRUE ? 1670 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 1671 1672 // Instruction's operands are backwards from what is expected. 
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 64 && Pred == SCC_TRUE) {
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 32;

  // 64-bit select is only available for SALU.
  if (Pred == SCC_TRUE) {
    SelOp = AMDGPU::S_CSELECT_B64;
    EltRC = &AMDGPU::SGPR_64RegClass;
    SubIndices = Sub0_15_64;

    assert(NElts % 2 == 0);
    NElts /= 2;
  }

  MachineInstrBuilder MIB = BuildMI(
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  I = MIB->getIterator();

  SmallVector<unsigned, 8> Regs;
  for (int Idx = 0; Idx != NElts; ++Idx) {
    unsigned DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(FalseReg, 0, SubIdx)
        .addReg(TrueReg, 0, SubIdx);
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}

bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
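    // (For example, the V_MOV_B32 halves produced when expanding
    // V_MOV_B64_PSEUDO above carry an extra implicit def of the full 64-bit
    // register and are therefore rejected here.)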
1751 unsigned NumOps = MI.getDesc().getNumOperands() + 1752 MI.getDesc().getNumImplicitUses(); 1753 1754 return MI.getNumOperands() == NumOps; 1755 } 1756 case AMDGPU::S_MOV_B32: 1757 case AMDGPU::S_MOV_B64: 1758 case AMDGPU::COPY: 1759 return true; 1760 default: 1761 return false; 1762 } 1763 } 1764 1765 static void removeModOperands(MachineInstr &MI) { 1766 unsigned Opc = MI.getOpcode(); 1767 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1768 AMDGPU::OpName::src0_modifiers); 1769 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1770 AMDGPU::OpName::src1_modifiers); 1771 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1772 AMDGPU::OpName::src2_modifiers); 1773 1774 MI.RemoveOperand(Src2ModIdx); 1775 MI.RemoveOperand(Src1ModIdx); 1776 MI.RemoveOperand(Src0ModIdx); 1777 } 1778 1779 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 1780 unsigned Reg, MachineRegisterInfo *MRI) const { 1781 if (!MRI->hasOneNonDBGUse(Reg)) 1782 return false; 1783 1784 unsigned Opc = UseMI.getOpcode(); 1785 if (Opc == AMDGPU::COPY) { 1786 bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); 1787 switch (DefMI.getOpcode()) { 1788 default: 1789 return false; 1790 case AMDGPU::S_MOV_B64: 1791 // TODO: We could fold 64-bit immediates, but this get compilicated 1792 // when there are sub-registers. 1793 return false; 1794 1795 case AMDGPU::V_MOV_B32_e32: 1796 case AMDGPU::S_MOV_B32: 1797 break; 1798 } 1799 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1800 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 1801 assert(ImmOp); 1802 // FIXME: We could handle FrameIndex values here. 1803 if (!ImmOp->isImm()) { 1804 return false; 1805 } 1806 UseMI.setDesc(get(NewOpc)); 1807 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 1808 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 1809 return true; 1810 } 1811 1812 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 1813 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { 1814 // Don't fold if we are using source or output modifiers. The new VOP2 1815 // instructions don't have them. 1816 if (hasAnyModifiersSet(UseMI)) 1817 return false; 1818 1819 const MachineOperand &ImmOp = DefMI.getOperand(1); 1820 1821 // If this is a free constant, there's no reason to do this. 1822 // TODO: We could fold this here instead of letting SIFoldOperands do it 1823 // later. 1824 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 1825 1826 // Any src operand can be used for the legality check. 1827 if (isInlineConstant(UseMI, *Src0, ImmOp)) 1828 return false; 1829 1830 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; 1831 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 1832 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 1833 1834 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 1835 // We should only expect these to be on src0 due to canonicalizations. 1836 if (Src0->isReg() && Src0->getReg() == Reg) { 1837 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1838 return false; 1839 1840 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 1841 return false; 1842 1843 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 1844 1845 const int64_t Imm = DefMI.getOperand(1).getImm(); 1846 1847 // FIXME: This would be a lot easier if we could return a new instruction 1848 // instead of having to modify in place. 
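// Sketch of the rewrite performed below (virtual register names are
// placeholders):
//   %k = v_mov_b32 <literal>
//   %d = v_mad_f32 %k, %b, %c   -->   %d = v_madmk_f32 %b, <literal>, %c
// with the defining v_mov_b32 erased afterwards if %k had no other uses.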
1849 1850 // Remove these first since they are at the end. 1851 UseMI.RemoveOperand( 1852 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1853 UseMI.RemoveOperand( 1854 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1855 1856 unsigned Src1Reg = Src1->getReg(); 1857 unsigned Src1SubReg = Src1->getSubReg(); 1858 Src0->setReg(Src1Reg); 1859 Src0->setSubReg(Src1SubReg); 1860 Src0->setIsKill(Src1->isKill()); 1861 1862 if (Opc == AMDGPU::V_MAC_F32_e64 || 1863 Opc == AMDGPU::V_MAC_F16_e64) 1864 UseMI.untieRegOperand( 1865 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1866 1867 Src1->ChangeToImmediate(Imm); 1868 1869 removeModOperands(UseMI); 1870 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); 1871 1872 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1873 if (DeleteDef) 1874 DefMI.eraseFromParent(); 1875 1876 return true; 1877 } 1878 1879 // Added part is the constant: Use v_madak_{f16, f32}. 1880 if (Src2->isReg() && Src2->getReg() == Reg) { 1881 // Not allowed to use constant bus for another operand. 1882 // We can however allow an inline immediate as src0. 1883 if (!Src0->isImm() && 1884 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1885 return false; 1886 1887 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1888 return false; 1889 1890 const int64_t Imm = DefMI.getOperand(1).getImm(); 1891 1892 // FIXME: This would be a lot easier if we could return a new instruction 1893 // instead of having to modify in place. 1894 1895 // Remove these first since they are at the end. 1896 UseMI.RemoveOperand( 1897 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1898 UseMI.RemoveOperand( 1899 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1900 1901 if (Opc == AMDGPU::V_MAC_F32_e64 || 1902 Opc == AMDGPU::V_MAC_F16_e64) 1903 UseMI.untieRegOperand( 1904 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1905 1906 // ChangingToImmediate adds Src2 back to the instruction. 1907 Src2->ChangeToImmediate(Imm); 1908 1909 // These come before src2. 1910 removeModOperands(UseMI); 1911 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); 1912 1913 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1914 if (DeleteDef) 1915 DefMI.eraseFromParent(); 1916 1917 return true; 1918 } 1919 } 1920 1921 return false; 1922 } 1923 1924 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1925 int WidthB, int OffsetB) { 1926 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1927 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1928 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 1929 return LowOffset + LowWidth <= HighOffset; 1930 } 1931 1932 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, 1933 MachineInstr &MIb) const { 1934 unsigned BaseReg0, BaseReg1; 1935 int64_t Offset0, Offset1; 1936 1937 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && 1938 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { 1939 1940 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 1941 // FIXME: Handle ds_read2 / ds_write2. 
1942 return false; 1943 } 1944 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 1945 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 1946 if (BaseReg0 == BaseReg1 && 1947 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1948 return true; 1949 } 1950 } 1951 1952 return false; 1953 } 1954 1955 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 1956 MachineInstr &MIb, 1957 AliasAnalysis *AA) const { 1958 assert((MIa.mayLoad() || MIa.mayStore()) && 1959 "MIa must load from or modify a memory location"); 1960 assert((MIb.mayLoad() || MIb.mayStore()) && 1961 "MIb must load from or modify a memory location"); 1962 1963 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 1964 return false; 1965 1966 // XXX - Can we relax this between address spaces? 1967 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1968 return false; 1969 1970 if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { 1971 const MachineMemOperand *MMOa = *MIa.memoperands_begin(); 1972 const MachineMemOperand *MMOb = *MIb.memoperands_begin(); 1973 if (MMOa->getValue() && MMOb->getValue()) { 1974 MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); 1975 MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); 1976 if (!AA->alias(LocA, LocB)) 1977 return true; 1978 } 1979 } 1980 1981 // TODO: Should we check the address space from the MachineMemOperand? That 1982 // would allow us to distinguish objects we know don't alias based on the 1983 // underlying address space, even if it was lowered to a different one, 1984 // e.g. private accesses lowered to use MUBUF instructions on a scratch 1985 // buffer. 1986 if (isDS(MIa)) { 1987 if (isDS(MIb)) 1988 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1989 1990 return !isFLAT(MIb); 1991 } 1992 1993 if (isMUBUF(MIa) || isMTBUF(MIa)) { 1994 if (isMUBUF(MIb) || isMTBUF(MIb)) 1995 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1996 1997 return !isFLAT(MIb) && !isSMRD(MIb); 1998 } 1999 2000 if (isSMRD(MIa)) { 2001 if (isSMRD(MIb)) 2002 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2003 2004 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 2005 } 2006 2007 if (isFLAT(MIa)) { 2008 if (isFLAT(MIb)) 2009 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2010 2011 return false; 2012 } 2013 2014 return false; 2015 } 2016 2017 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2018 MachineInstr &MI, 2019 LiveVariables *LV) const { 2020 bool IsF16 = false; 2021 2022 switch (MI.getOpcode()) { 2023 default: 2024 return nullptr; 2025 case AMDGPU::V_MAC_F16_e64: 2026 IsF16 = true; 2027 case AMDGPU::V_MAC_F32_e64: 2028 break; 2029 case AMDGPU::V_MAC_F16_e32: 2030 IsF16 = true; 2031 case AMDGPU::V_MAC_F32_e32: { 2032 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2033 AMDGPU::OpName::src0); 2034 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2035 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2036 return nullptr; 2037 break; 2038 } 2039 } 2040 2041 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2042 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2043 const MachineOperand *Src0Mods = 2044 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2045 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2046 const MachineOperand *Src1Mods = 2047 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2048 const MachineOperand *Src2 = getNamedOperand(MI, 
AMDGPU::OpName::src2); 2049 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2050 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2051 2052 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2053 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) 2054 .add(*Dst) 2055 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2056 .add(*Src0) 2057 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2058 .add(*Src1) 2059 .addImm(0) // Src mods 2060 .add(*Src2) 2061 .addImm(Clamp ? Clamp->getImm() : 0) 2062 .addImm(Omod ? Omod->getImm() : 0); 2063 } 2064 2065 // It's not generally safe to move VALU instructions across these since it will 2066 // start using the register as a base index rather than directly. 2067 // XXX - Why isn't hasSideEffects sufficient for these? 2068 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 2069 switch (MI.getOpcode()) { 2070 case AMDGPU::S_SET_GPR_IDX_ON: 2071 case AMDGPU::S_SET_GPR_IDX_MODE: 2072 case AMDGPU::S_SET_GPR_IDX_OFF: 2073 return true; 2074 default: 2075 return false; 2076 } 2077 } 2078 2079 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 2080 const MachineBasicBlock *MBB, 2081 const MachineFunction &MF) const { 2082 // XXX - Do we want the SP check in the base implementation? 2083 2084 // Target-independent instructions do not have an implicit-use of EXEC, even 2085 // when they operate on VGPRs. Treating EXEC modifications as scheduling 2086 // boundaries prevents incorrect movements of such instructions. 2087 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 2088 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 2089 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 2090 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 2091 changesVGPRIndexingMode(MI); 2092 } 2093 2094 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 2095 switch (Imm.getBitWidth()) { 2096 case 32: 2097 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 2098 ST.hasInv2PiInlineImm()); 2099 case 64: 2100 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 2101 ST.hasInv2PiInlineImm()); 2102 case 16: 2103 return ST.has16BitInsts() && 2104 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 2105 ST.hasInv2PiInlineImm()); 2106 default: 2107 llvm_unreachable("invalid bitwidth"); 2108 } 2109 } 2110 2111 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 2112 uint8_t OperandType) const { 2113 if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) 2114 return false; 2115 2116 // MachineOperand provides no way to tell the true operand size, since it only 2117 // records a 64-bit value. We need to know the size to determine if a 32-bit 2118 // floating point immediate bit pattern is legal for an integer immediate. It 2119 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
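// For example, 1.0f is stored here as the 64-bit value 0x3f800000: that bit
// pattern is the inline constant 1.0 for a 32-bit FP operand, but it is not
// an inline constant for a 64-bit operand and would have to be encoded as a
// literal.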
2120 2121 int64_t Imm = MO.getImm(); 2122 switch (OperandType) { 2123 case AMDGPU::OPERAND_REG_IMM_INT32: 2124 case AMDGPU::OPERAND_REG_IMM_FP32: 2125 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2126 case AMDGPU::OPERAND_REG_INLINE_C_FP32: { 2127 int32_t Trunc = static_cast<int32_t>(Imm); 2128 return Trunc == Imm && 2129 AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 2130 } 2131 case AMDGPU::OPERAND_REG_IMM_INT64: 2132 case AMDGPU::OPERAND_REG_IMM_FP64: 2133 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2134 case AMDGPU::OPERAND_REG_INLINE_C_FP64: { 2135 return AMDGPU::isInlinableLiteral64(MO.getImm(), 2136 ST.hasInv2PiInlineImm()); 2137 } 2138 case AMDGPU::OPERAND_REG_IMM_INT16: 2139 case AMDGPU::OPERAND_REG_IMM_FP16: 2140 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2141 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2142 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 2143 // A few special case instructions have 16-bit operands on subtargets 2144 // where 16-bit instructions are not legal. 2145 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle 2146 // constants in these cases 2147 int16_t Trunc = static_cast<int16_t>(Imm); 2148 return ST.has16BitInsts() && 2149 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 2150 } 2151 2152 return false; 2153 } 2154 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 2155 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { 2156 uint32_t Trunc = static_cast<uint32_t>(Imm); 2157 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 2158 } 2159 default: 2160 llvm_unreachable("invalid bitwidth"); 2161 } 2162 } 2163 2164 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 2165 const MCOperandInfo &OpInfo) const { 2166 switch (MO.getType()) { 2167 case MachineOperand::MO_Register: 2168 return false; 2169 case MachineOperand::MO_Immediate: 2170 return !isInlineConstant(MO, OpInfo); 2171 case MachineOperand::MO_FrameIndex: 2172 case MachineOperand::MO_MachineBasicBlock: 2173 case MachineOperand::MO_ExternalSymbol: 2174 case MachineOperand::MO_GlobalAddress: 2175 case MachineOperand::MO_MCSymbol: 2176 return true; 2177 default: 2178 llvm_unreachable("unexpected operand type"); 2179 } 2180 } 2181 2182 static bool compareMachineOp(const MachineOperand &Op0, 2183 const MachineOperand &Op1) { 2184 if (Op0.getType() != Op1.getType()) 2185 return false; 2186 2187 switch (Op0.getType()) { 2188 case MachineOperand::MO_Register: 2189 return Op0.getReg() == Op1.getReg(); 2190 case MachineOperand::MO_Immediate: 2191 return Op0.getImm() == Op1.getImm(); 2192 default: 2193 llvm_unreachable("Didn't expect to be comparing these operand types"); 2194 } 2195 } 2196 2197 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 2198 const MachineOperand &MO) const { 2199 const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; 2200 2201 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2202 2203 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 2204 return true; 2205 2206 if (OpInfo.RegClass < 0) 2207 return false; 2208 2209 if (MO.isImm() && isInlineConstant(MO, OpInfo)) 2210 return RI.opCanUseInlineConstant(OpInfo.OperandType); 2211 2212 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 2213 } 2214 2215 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 2216 int Op32 = AMDGPU::getVOPe32(Opcode); 2217 if (Op32 == -1) 2218 return false; 2219 2220 return pseudoToMCOpcode(Op32) != -1; 2221 } 2222 2223 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 2224 // The src0_modifier operand 
is present on all instructions 2225 // that have modifiers. 2226 2227 return AMDGPU::getNamedOperandIdx(Opcode, 2228 AMDGPU::OpName::src0_modifiers) != -1; 2229 } 2230 2231 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 2232 unsigned OpName) const { 2233 const MachineOperand *Mods = getNamedOperand(MI, OpName); 2234 return Mods && Mods->getImm(); 2235 } 2236 2237 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 2238 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 2239 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 2240 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 2241 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 2242 hasModifiersSet(MI, AMDGPU::OpName::omod); 2243 } 2244 2245 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 2246 const MachineOperand &MO, 2247 const MCOperandInfo &OpInfo) const { 2248 // Literal constants use the constant bus. 2249 //if (isLiteralConstantLike(MO, OpInfo)) 2250 // return true; 2251 if (MO.isImm()) 2252 return !isInlineConstant(MO, OpInfo); 2253 2254 if (!MO.isReg()) 2255 return true; // Misc other operands like FrameIndex 2256 2257 if (!MO.isUse()) 2258 return false; 2259 2260 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 2261 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 2262 2263 // FLAT_SCR is just an SGPR pair. 2264 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 2265 return true; 2266 2267 // EXEC register uses the constant bus. 2268 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 2269 return true; 2270 2271 // SGPRs use the constant bus 2272 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 2273 (!MO.isImplicit() && 2274 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 2275 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 2276 } 2277 2278 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 2279 for (const MachineOperand &MO : MI.implicit_operands()) { 2280 // We only care about reads. 
2281 if (MO.isDef()) 2282 continue; 2283 2284 switch (MO.getReg()) { 2285 case AMDGPU::VCC: 2286 case AMDGPU::M0: 2287 case AMDGPU::FLAT_SCR: 2288 return MO.getReg(); 2289 2290 default: 2291 break; 2292 } 2293 } 2294 2295 return AMDGPU::NoRegister; 2296 } 2297 2298 static bool shouldReadExec(const MachineInstr &MI) { 2299 if (SIInstrInfo::isVALU(MI)) { 2300 switch (MI.getOpcode()) { 2301 case AMDGPU::V_READLANE_B32: 2302 case AMDGPU::V_READLANE_B32_si: 2303 case AMDGPU::V_READLANE_B32_vi: 2304 case AMDGPU::V_WRITELANE_B32: 2305 case AMDGPU::V_WRITELANE_B32_si: 2306 case AMDGPU::V_WRITELANE_B32_vi: 2307 return false; 2308 } 2309 2310 return true; 2311 } 2312 2313 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 2314 SIInstrInfo::isSALU(MI) || 2315 SIInstrInfo::isSMRD(MI)) 2316 return false; 2317 2318 return true; 2319 } 2320 2321 static bool isSubRegOf(const SIRegisterInfo &TRI, 2322 const MachineOperand &SuperVec, 2323 const MachineOperand &SubReg) { 2324 if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) 2325 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 2326 2327 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 2328 SubReg.getReg() == SuperVec.getReg(); 2329 } 2330 2331 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 2332 StringRef &ErrInfo) const { 2333 uint16_t Opcode = MI.getOpcode(); 2334 2335 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 2336 return true; 2337 2338 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2339 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 2340 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 2341 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 2342 2343 // Make sure the number of operands is correct. 2344 const MCInstrDesc &Desc = get(Opcode); 2345 if (!Desc.isVariadic() && 2346 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 2347 ErrInfo = "Instruction has wrong number of operands."; 2348 return false; 2349 } 2350 2351 if (MI.isInlineAsm()) { 2352 // Verify register classes for inlineasm constraints. 2353 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 2354 I != E; ++I) { 2355 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 2356 if (!RC) 2357 continue; 2358 2359 const MachineOperand &Op = MI.getOperand(I); 2360 if (!Op.isReg()) 2361 continue; 2362 2363 unsigned Reg = Op.getReg(); 2364 if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { 2365 ErrInfo = "inlineasm operand has incorrect register class."; 2366 return false; 2367 } 2368 } 2369 2370 return true; 2371 } 2372 2373 // Make sure the register classes are correct. 2374 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 2375 if (MI.getOperand(i).isFPImm()) { 2376 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 2377 "all fp values to integers."; 2378 return false; 2379 } 2380 2381 int RegClass = Desc.OpInfo[i].RegClass; 2382 2383 switch (Desc.OpInfo[i].OperandType) { 2384 case MCOI::OPERAND_REGISTER: 2385 if (MI.getOperand(i).isImm()) { 2386 ErrInfo = "Illegal immediate value for operand."; 2387 return false; 2388 } 2389 break; 2390 case AMDGPU::OPERAND_REG_IMM_INT32: 2391 case AMDGPU::OPERAND_REG_IMM_FP32: 2392 break; 2393 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2394 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 2395 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2396 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2397 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2398 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2399 const MachineOperand &MO = MI.getOperand(i); 2400 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 2401 ErrInfo = "Illegal immediate value for operand."; 2402 return false; 2403 } 2404 break; 2405 } 2406 case MCOI::OPERAND_IMMEDIATE: 2407 case AMDGPU::OPERAND_KIMM32: 2408 // Check if this operand is an immediate. 2409 // FrameIndex operands will be replaced by immediates, so they are 2410 // allowed. 2411 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 2412 ErrInfo = "Expected immediate, but got non-immediate"; 2413 return false; 2414 } 2415 LLVM_FALLTHROUGH; 2416 default: 2417 continue; 2418 } 2419 2420 if (!MI.getOperand(i).isReg()) 2421 continue; 2422 2423 if (RegClass != -1) { 2424 unsigned Reg = MI.getOperand(i).getReg(); 2425 if (Reg == AMDGPU::NoRegister || 2426 TargetRegisterInfo::isVirtualRegister(Reg)) 2427 continue; 2428 2429 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 2430 if (!RC->contains(Reg)) { 2431 ErrInfo = "Operand has incorrect register class."; 2432 return false; 2433 } 2434 } 2435 } 2436 2437 // Verify VOP* 2438 if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { 2439 // Only look at the true operands. Only a real operand can use the constant 2440 // bus, and we don't want to check pseudo-operands like the source modifier 2441 // flags. 2442 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 2443 2444 unsigned ConstantBusCount = 0; 2445 2446 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 2447 ++ConstantBusCount; 2448 2449 unsigned SGPRUsed = findImplicitSGPRRead(MI); 2450 if (SGPRUsed != AMDGPU::NoRegister) 2451 ++ConstantBusCount; 2452 2453 for (int OpIdx : OpIndices) { 2454 if (OpIdx == -1) 2455 break; 2456 const MachineOperand &MO = MI.getOperand(OpIdx); 2457 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 2458 if (MO.isReg()) { 2459 if (MO.getReg() != SGPRUsed) 2460 ++ConstantBusCount; 2461 SGPRUsed = MO.getReg(); 2462 } else { 2463 ++ConstantBusCount; 2464 } 2465 } 2466 } 2467 if (ConstantBusCount > 1) { 2468 ErrInfo = "VOP* instruction uses the constant bus more than once"; 2469 return false; 2470 } 2471 } 2472 2473 // Verify misc. restrictions on specific instructions. 
2474 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 2475 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 2476 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2477 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 2478 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 2479 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 2480 if (!compareMachineOp(Src0, Src1) && 2481 !compareMachineOp(Src0, Src2)) { 2482 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 2483 return false; 2484 } 2485 } 2486 } 2487 2488 if (isSOPK(MI)) { 2489 int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 2490 if (sopkIsZext(MI)) { 2491 if (!isUInt<16>(Imm)) { 2492 ErrInfo = "invalid immediate for SOPK instruction"; 2493 return false; 2494 } 2495 } else { 2496 if (!isInt<16>(Imm)) { 2497 ErrInfo = "invalid immediate for SOPK instruction"; 2498 return false; 2499 } 2500 } 2501 } 2502 2503 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 2504 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 2505 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2506 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 2507 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2508 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 2509 2510 const unsigned StaticNumOps = Desc.getNumOperands() + 2511 Desc.getNumImplicitUses(); 2512 const unsigned NumImplicitOps = IsDst ? 2 : 1; 2513 2514 // Allow additional implicit operands. This allows a fixup done by the post 2515 // RA scheduler where the main implicit operand is killed and implicit-defs 2516 // are added for sub-registers that remain live after this instruction. 2517 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 2518 ErrInfo = "missing implicit register operands"; 2519 return false; 2520 } 2521 2522 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2523 if (IsDst) { 2524 if (!Dst->isUse()) { 2525 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 2526 return false; 2527 } 2528 2529 unsigned UseOpIdx; 2530 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 2531 UseOpIdx != StaticNumOps + 1) { 2532 ErrInfo = "movrel implicit operands should be tied"; 2533 return false; 2534 } 2535 } 2536 2537 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2538 const MachineOperand &ImpUse 2539 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 2540 if (!ImpUse.isReg() || !ImpUse.isUse() || 2541 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 2542 ErrInfo = "src0 should be subreg of implicit vector use"; 2543 return false; 2544 } 2545 } 2546 2547 // Make sure we aren't losing exec uses in the td files. This mostly requires 2548 // being careful when using let Uses to try to add other use registers. 2549 if (shouldReadExec(MI)) { 2550 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 2551 ErrInfo = "VALU instruction does not implicitly read exec mask"; 2552 return false; 2553 } 2554 } 2555 2556 if (isSMRD(MI)) { 2557 if (MI.mayStore()) { 2558 // The register offset form of scalar stores may only use m0 as the 2559 // soffset register. 
2560 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 2561 if (Soff && Soff->getReg() != AMDGPU::M0) { 2562 ErrInfo = "scalar stores must use m0 as offset register"; 2563 return false; 2564 } 2565 } 2566 } 2567 2568 return true; 2569 } 2570 2571 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 2572 switch (MI.getOpcode()) { 2573 default: return AMDGPU::INSTRUCTION_LIST_END; 2574 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 2575 case AMDGPU::COPY: return AMDGPU::COPY; 2576 case AMDGPU::PHI: return AMDGPU::PHI; 2577 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 2578 case AMDGPU::S_MOV_B32: 2579 return MI.getOperand(1).isReg() ? 2580 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 2581 case AMDGPU::S_ADD_I32: 2582 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 2583 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 2584 case AMDGPU::S_SUB_I32: 2585 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 2586 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 2587 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 2588 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 2589 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 2590 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 2591 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 2592 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 2593 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 2594 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 2595 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 2596 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 2597 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 2598 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 2599 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 2600 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 2601 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 2602 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 2603 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 2604 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 2605 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 2606 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 2607 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 2608 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 2609 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 2610 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 2611 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 2612 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 2613 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 2614 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 2615 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 2616 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 2617 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 2618 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 2619 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 2620 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 2621 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 2622 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 2623 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 2624 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 2625 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 2626 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 2627 case AMDGPU::S_CBRANCH_SCC0: return 
AMDGPU::S_CBRANCH_VCCZ; 2628 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 2629 } 2630 } 2631 2632 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 2633 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 2634 } 2635 2636 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 2637 unsigned OpNo) const { 2638 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2639 const MCInstrDesc &Desc = get(MI.getOpcode()); 2640 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 2641 Desc.OpInfo[OpNo].RegClass == -1) { 2642 unsigned Reg = MI.getOperand(OpNo).getReg(); 2643 2644 if (TargetRegisterInfo::isVirtualRegister(Reg)) 2645 return MRI.getRegClass(Reg); 2646 return RI.getPhysRegClass(Reg); 2647 } 2648 2649 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 2650 return RI.getRegClass(RCID); 2651 } 2652 2653 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 2654 switch (MI.getOpcode()) { 2655 case AMDGPU::COPY: 2656 case AMDGPU::REG_SEQUENCE: 2657 case AMDGPU::PHI: 2658 case AMDGPU::INSERT_SUBREG: 2659 return RI.hasVGPRs(getOpRegClass(MI, 0)); 2660 default: 2661 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 2662 } 2663 } 2664 2665 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 2666 MachineBasicBlock::iterator I = MI; 2667 MachineBasicBlock *MBB = MI.getParent(); 2668 MachineOperand &MO = MI.getOperand(OpIdx); 2669 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2670 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 2671 const TargetRegisterClass *RC = RI.getRegClass(RCID); 2672 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 2673 if (MO.isReg()) 2674 Opcode = AMDGPU::COPY; 2675 else if (RI.isSGPRClass(RC)) 2676 Opcode = AMDGPU::S_MOV_B32; 2677 2678 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 2679 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 2680 VRC = &AMDGPU::VReg_64RegClass; 2681 else 2682 VRC = &AMDGPU::VGPR_32RegClass; 2683 2684 unsigned Reg = MRI.createVirtualRegister(VRC); 2685 DebugLoc DL = MBB->findDebugLoc(I); 2686 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 2687 MO.ChangeToRegister(Reg, false); 2688 } 2689 2690 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 2691 MachineRegisterInfo &MRI, 2692 MachineOperand &SuperReg, 2693 const TargetRegisterClass *SuperRC, 2694 unsigned SubIdx, 2695 const TargetRegisterClass *SubRC) 2696 const { 2697 MachineBasicBlock *MBB = MI->getParent(); 2698 DebugLoc DL = MI->getDebugLoc(); 2699 unsigned SubReg = MRI.createVirtualRegister(SubRC); 2700 2701 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 2702 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2703 .addReg(SuperReg.getReg(), 0, SubIdx); 2704 return SubReg; 2705 } 2706 2707 // Just in case the super register is itself a sub-register, copy it to a new 2708 // value so we don't need to worry about merging its subreg index with the 2709 // SubIdx passed to this function. The register coalescer should be able to 2710 // eliminate this extra copy. 
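// e.g. when asked for sub0 of a value that itself lives in %X:sub2_sub3,
// the code below first copies %X:sub2_sub3 into a fresh register of SuperRC
// and then extracts sub0 from that copy, rather than composing the two
// sub-register indices here.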
2711 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 2712 2713 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 2714 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 2715 2716 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2717 .addReg(NewSuperReg, 0, SubIdx); 2718 2719 return SubReg; 2720 } 2721 2722 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 2723 MachineBasicBlock::iterator MII, 2724 MachineRegisterInfo &MRI, 2725 MachineOperand &Op, 2726 const TargetRegisterClass *SuperRC, 2727 unsigned SubIdx, 2728 const TargetRegisterClass *SubRC) const { 2729 if (Op.isImm()) { 2730 if (SubIdx == AMDGPU::sub0) 2731 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 2732 if (SubIdx == AMDGPU::sub1) 2733 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 2734 2735 llvm_unreachable("Unhandled register index for immediate"); 2736 } 2737 2738 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 2739 SubIdx, SubRC); 2740 return MachineOperand::CreateReg(SubReg, false); 2741 } 2742 2743 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 2744 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 2745 assert(Inst.getNumExplicitOperands() == 3); 2746 MachineOperand Op1 = Inst.getOperand(1); 2747 Inst.RemoveOperand(1); 2748 Inst.addOperand(Op1); 2749 } 2750 2751 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 2752 const MCOperandInfo &OpInfo, 2753 const MachineOperand &MO) const { 2754 if (!MO.isReg()) 2755 return false; 2756 2757 unsigned Reg = MO.getReg(); 2758 const TargetRegisterClass *RC = 2759 TargetRegisterInfo::isVirtualRegister(Reg) ? 2760 MRI.getRegClass(Reg) : 2761 RI.getPhysRegClass(Reg); 2762 2763 const SIRegisterInfo *TRI = 2764 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 2765 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 2766 2767 // In order to be legal, the common sub-class must be equal to the 2768 // class of the current operand. For example: 2769 // 2770 // v_mov_b32 s0 ; Operand defined as vsrc_b32 2771 // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL 2772 // 2773 // s_sendmsg 0, s0 ; Operand defined as m0reg 2774 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 2775 2776 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 2777 } 2778 2779 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 2780 const MCOperandInfo &OpInfo, 2781 const MachineOperand &MO) const { 2782 if (MO.isReg()) 2783 return isLegalRegOperand(MRI, OpInfo, MO); 2784 2785 // Handle non-register types that are treated like immediates. 2786 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2787 return true; 2788 } 2789 2790 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 2791 const MachineOperand *MO) const { 2792 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 2793 const MCInstrDesc &InstDesc = MI.getDesc(); 2794 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 2795 const TargetRegisterClass *DefinedRC = 2796 OpInfo.RegClass != -1 ? 
RI.getRegClass(OpInfo.RegClass) : nullptr; 2797 if (!MO) 2798 MO = &MI.getOperand(OpIdx); 2799 2800 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 2801 2802 RegSubRegPair SGPRUsed; 2803 if (MO->isReg()) 2804 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 2805 2806 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 2807 if (i == OpIdx) 2808 continue; 2809 const MachineOperand &Op = MI.getOperand(i); 2810 if (Op.isReg()) { 2811 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 2812 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 2813 return false; 2814 } 2815 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 2816 return false; 2817 } 2818 } 2819 } 2820 2821 if (MO->isReg()) { 2822 assert(DefinedRC); 2823 return isLegalRegOperand(MRI, OpInfo, *MO); 2824 } 2825 2826 // Handle non-register types that are treated like immediates. 2827 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 2828 2829 if (!DefinedRC) { 2830 // This operand expects an immediate. 2831 return true; 2832 } 2833 2834 return isImmOperandLegal(MI, OpIdx, *MO); 2835 } 2836 2837 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 2838 MachineInstr &MI) const { 2839 unsigned Opc = MI.getOpcode(); 2840 const MCInstrDesc &InstrDesc = get(Opc); 2841 2842 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2843 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2844 2845 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 2846 // we need to only have one constant bus use. 2847 // 2848 // Note we do not need to worry about literal constants here. They are 2849 // disabled for the operand type for instructions because they will always 2850 // violate the one constant bus use rule. 2851 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 2852 if (HasImplicitSGPR) { 2853 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2854 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2855 2856 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 2857 legalizeOpWithMove(MI, Src0Idx); 2858 } 2859 2860 // VOP2 src0 instructions support all operand types, so we don't need to check 2861 // their legality. If src1 is already legal, we don't need to do anything. 2862 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 2863 return; 2864 2865 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 2866 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 2867 // select is uniform. 2868 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 2869 RI.isVGPR(MRI, Src1.getReg())) { 2870 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 2871 const DebugLoc &DL = MI.getDebugLoc(); 2872 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 2873 .add(Src1); 2874 Src1.ChangeToRegister(Reg, false); 2875 return; 2876 } 2877 2878 // We do not use commuteInstruction here because it is too aggressive and will 2879 // commute if it is possible. We only want to commute here if it improves 2880 // legality. This can be called a fairly large number of times so don't waste 2881 // compile time pointlessly swapping and checking legality again. 
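// Illustrative case: for something like  v_sub_f32 %d, %v0, %s1  the SGPR
// can be moved into src0 by commuting to v_subrev_f32, which is cheaper
// than inserting a v_mov_b32 copy of %s1.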
2882 if (HasImplicitSGPR || !MI.isCommutable()) { 2883 legalizeOpWithMove(MI, Src1Idx); 2884 return; 2885 } 2886 2887 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2888 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2889 2890 // If src0 can be used as src1, commuting will make the operands legal. 2891 // Otherwise we have to give up and insert a move. 2892 // 2893 // TODO: Other immediate-like operand kinds could be commuted if there was a 2894 // MachineOperand::ChangeTo* for them. 2895 if ((!Src1.isImm() && !Src1.isReg()) || 2896 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 2897 legalizeOpWithMove(MI, Src1Idx); 2898 return; 2899 } 2900 2901 int CommutedOpc = commuteOpcode(MI); 2902 if (CommutedOpc == -1) { 2903 legalizeOpWithMove(MI, Src1Idx); 2904 return; 2905 } 2906 2907 MI.setDesc(get(CommutedOpc)); 2908 2909 unsigned Src0Reg = Src0.getReg(); 2910 unsigned Src0SubReg = Src0.getSubReg(); 2911 bool Src0Kill = Src0.isKill(); 2912 2913 if (Src1.isImm()) 2914 Src0.ChangeToImmediate(Src1.getImm()); 2915 else if (Src1.isReg()) { 2916 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 2917 Src0.setSubReg(Src1.getSubReg()); 2918 } else 2919 llvm_unreachable("Should only have register or immediate operands"); 2920 2921 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 2922 Src1.setSubReg(Src0SubReg); 2923 } 2924 2925 // Legalize VOP3 operands. Because all operand types are supported for any 2926 // operand, and since literal constants are not allowed and should never be 2927 // seen, we only need to worry about inserting copies if we use multiple SGPR 2928 // operands. 2929 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 2930 MachineInstr &MI) const { 2931 unsigned Opc = MI.getOpcode(); 2932 2933 int VOP3Idx[3] = { 2934 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 2935 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 2936 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 2937 }; 2938 2939 // Find the one SGPR operand we are allowed to use. 2940 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 2941 2942 for (unsigned i = 0; i < 3; ++i) { 2943 int Idx = VOP3Idx[i]; 2944 if (Idx == -1) 2945 break; 2946 MachineOperand &MO = MI.getOperand(Idx); 2947 2948 // We should never see a VOP3 instruction with an illegal immediate operand. 2949 if (!MO.isReg()) 2950 continue; 2951 2952 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2953 continue; // VGPRs are legal 2954 2955 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { 2956 SGPRReg = MO.getReg(); 2957 // We can use one SGPR in each VOP3 instruction. 2958 continue; 2959 } 2960 2961 // If we make it this far, then the operand is not legal and we must 2962 // legalize it. 
2963 legalizeOpWithMove(MI, Idx);
2964 }
2965 }
2966
2967 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
2968 MachineRegisterInfo &MRI) const {
2969 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
2970 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
2971 unsigned DstReg = MRI.createVirtualRegister(SRC);
2972 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
2973
2974 SmallVector<unsigned, 8> SRegs;
2975 for (unsigned i = 0; i < SubRegs; ++i) {
2976 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2977 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2978 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2979 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2980 SRegs.push_back(SGPR);
2981 }
2982
2983 MachineInstrBuilder MIB =
2984 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2985 get(AMDGPU::REG_SEQUENCE), DstReg);
2986 for (unsigned i = 0; i < SubRegs; ++i) {
2987 MIB.addReg(SRegs[i]);
2988 MIB.addImm(RI.getSubRegFromChannel(i));
2989 }
2990 return DstReg;
2991 }
2992
2993 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2994 MachineInstr &MI) const {
2995
2996 // If the pointer is stored in VGPRs, then we need to move it to
2997 // SGPRs using v_readfirstlane. This is safe because we only select
2998 // loads with uniform pointers to SMRD instructions, so we know the
2999 // pointer value is uniform.
3000 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3001 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3002 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3003 SBase->setReg(SGPR);
3004 }
3005 }
3006
3007 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3008 MachineBasicBlock::iterator I,
3009 const TargetRegisterClass *DstRC,
3010 MachineOperand &Op,
3011 MachineRegisterInfo &MRI,
3012 const DebugLoc &DL) const {
3013
3014 unsigned OpReg = Op.getReg();
3015 unsigned OpSubReg = Op.getSubReg();
3016
3017 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3018 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3019
3020 // Check if operand is already the correct register class.
3021 if (DstRC == OpRC)
3022 return;
3023
3024 unsigned DstReg = MRI.createVirtualRegister(DstRC);
3025 MachineInstr *Copy =
3026 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3027
3028 Op.setReg(DstReg);
3029 Op.setSubReg(0);
3030
3031 MachineInstr *Def = MRI.getVRegDef(OpReg);
3032 if (!Def)
3033 return;
3034
3035 // Try to eliminate the copy if it is copying an immediate value.
3036 if (Def->isMoveImmediate())
3037 FoldImmediate(*Copy, *Def, OpReg, &MRI);
3038 }
3039
3040 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
3041 MachineFunction &MF = *MI.getParent()->getParent();
3042 MachineRegisterInfo &MRI = MF.getRegInfo();
3043
3044 // Legalize VOP2
3045 if (isVOP2(MI) || isVOPC(MI)) {
3046 legalizeOperandsVOP2(MRI, MI);
3047 return;
3048 }
3049
3050 // Legalize VOP3
3051 if (isVOP3(MI)) {
3052 legalizeOperandsVOP3(MRI, MI);
3053 return;
3054 }
3055
3056 // Legalize SMRD
3057 if (isSMRD(MI)) {
3058 legalizeOperandsSMRD(MRI, MI);
3059 return;
3060 }
3061
3062 // Legalize REG_SEQUENCE and PHI
3063 // The register class of the operands must be the same type as the register
3064 // class of the output.
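// e.g. a PHI that merges an SGPR input with a VGPR input is rewritten as an
// all-VGPR PHI; the required copies are inserted before the terminator of
// the corresponding predecessor block rather than next to the PHI itself.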
3065 if (MI.getOpcode() == AMDGPU::PHI) {
3066 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3067 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3068 if (!MI.getOperand(i).isReg() ||
3069 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3070 continue;
3071 const TargetRegisterClass *OpRC =
3072 MRI.getRegClass(MI.getOperand(i).getReg());
3073 if (RI.hasVGPRs(OpRC)) {
3074 VRC = OpRC;
3075 } else {
3076 SRC = OpRC;
3077 }
3078 }
3079
3080 // If any of the operands are VGPR registers, then they all must be VGPRs,
3081 // otherwise we will create illegal VGPR->SGPR copies when legalizing
3082 // them.
3083 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3084 if (!VRC) {
3085 assert(SRC);
3086 VRC = RI.getEquivalentVGPRClass(SRC);
3087 }
3088 RC = VRC;
3089 } else {
3090 RC = SRC;
3091 }
3092
3093 // Update all the operands so they have the same type.
3094 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3095 MachineOperand &Op = MI.getOperand(I);
3096 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3097 continue;
3098
3099 // MI is a PHI instruction.
3100 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3101 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3102
3103 // Avoid creating no-op copies with the same src and dst reg class. These
3104 // confuse some of the machine passes.
3105 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3106 }
3107 }
3108
3109 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3110 // VGPR dest type and SGPR sources, insert copies so all operands are
3111 // VGPRs. This seems to help operand folding / the register coalescer.
3112 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3113 MachineBasicBlock *MBB = MI.getParent();
3114 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3115 if (RI.hasVGPRs(DstRC)) {
3116 // Update all the operands so they are VGPR register classes. These may
3117 // not be the same register class because REG_SEQUENCE supports mixing
3118 // subregister index types e.g. sub0_sub1 + sub2 + sub3
3119 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3120 MachineOperand &Op = MI.getOperand(I);
3121 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3122 continue;
3123
3124 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3125 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3126 if (VRC == OpRC)
3127 continue;
3128
3129 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3130 Op.setIsKill();
3131 }
3132 }
3133
3134 return;
3135 }
3136
3137 // Legalize INSERT_SUBREG
3138 // src0 must have the same register class as dst
3139 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3140 unsigned Dst = MI.getOperand(0).getReg();
3141 unsigned Src0 = MI.getOperand(1).getReg();
3142 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3143 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3144 if (DstRC != Src0RC) {
3145 MachineBasicBlock *MBB = MI.getParent();
3146 MachineOperand &Op = MI.getOperand(1);
3147 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3148 }
3149 return;
3150 }
3151
3152 // Legalize MIMG and MUBUF/MTBUF for shaders.
3153 //
3154 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3155 // scratch memory access. In both cases, the legalization never involves
3156 // conversion to the addr64 form.
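// e.g. an image sample whose sampler descriptor ended up in VGPRs gets one
// v_readfirstlane_b32 per 32-bit component plus a REG_SEQUENCE rebuilding
// the SGPR descriptor (see readlaneVGPRToSGPR above).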
3157 if (isMIMG(MI) ||
3158 (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
3159 (isMUBUF(MI) || isMTBUF(MI)))) {
3160 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3161 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3162 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3163 SRsrc->setReg(SGPR);
3164 }
3165
3166 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3167 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3168 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3169 SSamp->setReg(SGPR);
3170 }
3171 return;
3172 }
3173
3174 // Legalize MUBUF* instructions by converting to addr64 form.
3175 // FIXME: If we start using the non-addr64 instructions for compute, we
3176 // may need to legalize them as above. This especially applies to the
3177 // buffer_load_format_* variants and variants with idxen (or bothen).
3178 int SRsrcIdx =
3179 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3180 if (SRsrcIdx != -1) {
3181 // We have an MUBUF instruction
3182 MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
3183 unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
3184 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
3185 RI.getRegClass(SRsrcRC))) {
3186 // The operands are legal.
3187 // FIXME: We may need to legalize operands besides srsrc.
3188 return;
3189 }
3190
3191 MachineBasicBlock &MBB = *MI.getParent();
3192
3193 // Extract the ptr from the resource descriptor.
3194 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
3195 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3196
3197 // Create an empty resource descriptor
3198 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3199 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3200 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3201 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3202 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
3203
3204 // Zero64 = 0
3205 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
3206 .addImm(0);
3207
3208 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3209 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3210 .addImm(RsrcDataFormat & 0xFFFFFFFF);
3211
3212 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3213 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3214 .addImm(RsrcDataFormat >> 32);
3215
3216 // NewSRsrc = {Zero64, SRsrcFormat}
3217 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3218 .addReg(Zero64)
3219 .addImm(AMDGPU::sub0_sub1)
3220 .addReg(SRsrcFormatLo)
3221 .addImm(AMDGPU::sub2)
3222 .addReg(SRsrcFormatHi)
3223 .addImm(AMDGPU::sub3);
3224
3225 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3226 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3227 if (VAddr) {
3228 // This is already an ADDR64 instruction so we need to add the pointer
3229 // extracted from the resource descriptor to the current value of VAddr.
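// Roughly the sequence emitted below, with the carry threaded through VCC:
//   v_add_i32_e32  %lo, vcc, ptr.sub0, vaddr.sub0
//   v_addc_u32_e32 %hi, vcc, ptr.sub1, vaddr.sub1, vcc
//   %newvaddr = REG_SEQUENCE %lo, sub0, %hi, sub1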
3230 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3231 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3232
3233 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3234 DebugLoc DL = MI.getDebugLoc();
3235 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3236 .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3237 .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3238
3239 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3240 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3241 .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3242 .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3243
3244 // NewVaddr = {NewVaddrHi, NewVaddrLo}
3245 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3246 .addReg(NewVAddrLo)
3247 .addImm(AMDGPU::sub0)
3248 .addReg(NewVAddrHi)
3249 .addImm(AMDGPU::sub1);
3250 } else {
3251 // This instruction is the _OFFSET variant, so we need to convert it to
3252 // ADDR64.
3253 assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
3254 < SISubtarget::VOLCANIC_ISLANDS &&
3255 "FIXME: Need to emit flat atomics here");
3256
3257 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3258 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3259 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3260 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3261
3262 // Atomics with return have an additional tied operand and are
3263 // missing some of the special bits.
3264 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3265 MachineInstr *Addr64;
3266
3267 if (!VDataIn) {
3268 // Regular buffer load / store.
3269 MachineInstrBuilder MIB =
3270 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3271 .add(*VData)
3272 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3273 // This will be replaced later
3274 // with the new value of vaddr.
3275 .add(*SRsrc)
3276 .add(*SOffset)
3277 .add(*Offset);
3278
3279 // Atomics do not have this operand.
3280 if (const MachineOperand *GLC =
3281 getNamedOperand(MI, AMDGPU::OpName::glc)) {
3282 MIB.addImm(GLC->getImm());
3283 }
3284
3285 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3286
3287 if (const MachineOperand *TFE =
3288 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3289 MIB.addImm(TFE->getImm());
3290 }
3291
3292 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3293 Addr64 = MIB;
3294 } else {
3295 // Atomics with return.
3296 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3297 .add(*VData)
3298 .add(*VDataIn)
3299 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3300 // This will be replaced later
3301 // with the new value of vaddr.
3302 .add(*SRsrc) 3303 .add(*SOffset) 3304 .add(*Offset) 3305 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 3306 .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 3307 } 3308 3309 MI.removeFromParent(); 3310 3311 // NewVaddr = {NewVaddrHi, NewVaddrLo} 3312 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 3313 NewVAddr) 3314 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 3315 .addImm(AMDGPU::sub0) 3316 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 3317 .addImm(AMDGPU::sub1); 3318 3319 VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); 3320 SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); 3321 } 3322 3323 // Update the instruction to use NewVaddr 3324 VAddr->setReg(NewVAddr); 3325 // Update the instruction to use NewSRsrc 3326 SRsrc->setReg(NewSRsrc); 3327 } 3328 } 3329 3330 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 3331 SmallVector<MachineInstr *, 128> Worklist; 3332 Worklist.push_back(&TopInst); 3333 3334 while (!Worklist.empty()) { 3335 MachineInstr &Inst = *Worklist.pop_back_val(); 3336 MachineBasicBlock *MBB = Inst.getParent(); 3337 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 3338 3339 unsigned Opcode = Inst.getOpcode(); 3340 unsigned NewOpcode = getVALUOp(Inst); 3341 3342 // Handle some special cases 3343 switch (Opcode) { 3344 default: 3345 break; 3346 case AMDGPU::S_AND_B64: 3347 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 3348 Inst.eraseFromParent(); 3349 continue; 3350 3351 case AMDGPU::S_OR_B64: 3352 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 3353 Inst.eraseFromParent(); 3354 continue; 3355 3356 case AMDGPU::S_XOR_B64: 3357 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 3358 Inst.eraseFromParent(); 3359 continue; 3360 3361 case AMDGPU::S_NOT_B64: 3362 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 3363 Inst.eraseFromParent(); 3364 continue; 3365 3366 case AMDGPU::S_BCNT1_I32_B64: 3367 splitScalar64BitBCNT(Worklist, Inst); 3368 Inst.eraseFromParent(); 3369 continue; 3370 3371 case AMDGPU::S_BFE_I64: { 3372 splitScalar64BitBFE(Worklist, Inst); 3373 Inst.eraseFromParent(); 3374 continue; 3375 } 3376 3377 case AMDGPU::S_LSHL_B32: 3378 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3379 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 3380 swapOperands(Inst); 3381 } 3382 break; 3383 case AMDGPU::S_ASHR_I32: 3384 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3385 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 3386 swapOperands(Inst); 3387 } 3388 break; 3389 case AMDGPU::S_LSHR_B32: 3390 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3391 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 3392 swapOperands(Inst); 3393 } 3394 break; 3395 case AMDGPU::S_LSHL_B64: 3396 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3397 NewOpcode = AMDGPU::V_LSHLREV_B64; 3398 swapOperands(Inst); 3399 } 3400 break; 3401 case AMDGPU::S_ASHR_I64: 3402 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3403 NewOpcode = AMDGPU::V_ASHRREV_I64; 3404 swapOperands(Inst); 3405 } 3406 break; 3407 case AMDGPU::S_LSHR_B64: 3408 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3409 NewOpcode = AMDGPU::V_LSHRREV_B64; 3410 swapOperands(Inst); 3411 } 3412 break; 3413 3414 case AMDGPU::S_ABS_I32: 3415 lowerScalarAbs(Worklist, Inst); 3416 Inst.eraseFromParent(); 3417 continue; 3418 3419 case AMDGPU::S_CBRANCH_SCC0: 3420 case AMDGPU::S_CBRANCH_SCC1: 3421 // Clear unused bits of vcc 3422 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 3423 
AMDGPU::VCC)
3424 .addReg(AMDGPU::EXEC)
3425 .addReg(AMDGPU::VCC);
3426 break;
3427
3428 case AMDGPU::S_BFE_U64:
3429 case AMDGPU::S_BFM_B64:
3430 llvm_unreachable("Moving this op to VALU not implemented");
3431
3432 case AMDGPU::S_PACK_LL_B32_B16:
3433 case AMDGPU::S_PACK_LH_B32_B16:
3434 case AMDGPU::S_PACK_HH_B32_B16: {
3435 movePackToVALU(Worklist, MRI, Inst);
3436 Inst.eraseFromParent();
3437 continue;
3438 }
3439 }
3440
3441 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3442 // We cannot move this instruction to the VALU, so we should try to
3443 // legalize its operands instead.
3444 legalizeOperands(Inst);
3445 continue;
3446 }
3447
3448 // Use the new VALU Opcode.
3449 const MCInstrDesc &NewDesc = get(NewOpcode);
3450 Inst.setDesc(NewDesc);
3451
3452 // Remove any references to SCC. Vector instructions can't read from it, and
3453 // we're just about to add the implicit use / defs of VCC, and we don't want
3454 // both.
3455 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3456 MachineOperand &Op = Inst.getOperand(i);
3457 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3458 Inst.RemoveOperand(i);
3459 addSCCDefUsersToVALUWorklist(Inst, Worklist);
3460 }
3461 }
3462
3463 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
3464 // We are converting these to a BFE, so we need to add the missing
3465 // operands for the size and offset.
3466 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
3467 Inst.addOperand(MachineOperand::CreateImm(0));
3468 Inst.addOperand(MachineOperand::CreateImm(Size));
3469
3470 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
3471 // The VALU version adds the second operand to the result, so insert an
3472 // extra 0 operand.
3473 Inst.addOperand(MachineOperand::CreateImm(0));
3474 }
3475
3476 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
3477
3478 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
3479 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3480 // If we need to move this to VGPRs, we need to unpack the second operand
3481 // back into the 2 separate ones for bit offset and width.
3482 assert(OffsetWidthOp.isImm() &&
3483 "Scalar BFE is only implemented for constant width and offset");
3484 uint32_t Imm = OffsetWidthOp.getImm();
3485
3486 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3487 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3488 Inst.RemoveOperand(2); // Remove old immediate.
3489 Inst.addOperand(MachineOperand::CreateImm(Offset));
3490 Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3491 }
3492
3493 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3494 unsigned NewDstReg = AMDGPU::NoRegister;
3495 if (HasDst) {
3496 unsigned DstReg = Inst.getOperand(0).getReg();
3497 if (TargetRegisterInfo::isPhysicalRegister(DstReg))
3498 continue;
3499
3500 // Update the destination register class.
3501 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3502 if (!NewDstRC)
3503 continue;
3504
3505 if (Inst.isCopy() &&
3506 TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
3507 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
3508 // Instead of creating a copy where src and dst are the same register
3509 // class, we just replace all uses of dst with src. These kinds of
3510 // copies interfere with the heuristics MachineSink uses to decide
3511 // whether or not to split a critical edge, since the pass assumes
3512 // that copies will end up as machine instructions and not be
3513 // eliminated.
3514 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3515 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3516 MRI.clearKillFlags(Inst.getOperand(1).getReg());
3517 Inst.getOperand(0).setReg(DstReg);
3518 continue;
3519 }
3520
3521 NewDstReg = MRI.createVirtualRegister(NewDstRC);
3522 MRI.replaceRegWith(DstReg, NewDstReg);
3523 }
3524
3525 // Legalize the operands.
3526 legalizeOperands(Inst);
3527
3528 if (HasDst)
3529 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
3530 }
3531 }
3532
3533 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
3534 MachineInstr &Inst) const {
3535 MachineBasicBlock &MBB = *Inst.getParent();
3536 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3537 MachineBasicBlock::iterator MII = Inst;
3538 DebugLoc DL = Inst.getDebugLoc();
3539
3540 MachineOperand &Dest = Inst.getOperand(0);
3541 MachineOperand &Src = Inst.getOperand(1);
3542 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3543 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3544
3545 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
3546 .addImm(0)
3547 .addReg(Src.getReg());
3548
3549 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
3550 .addReg(Src.getReg())
3551 .addReg(TmpReg);
3552
3553 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3554 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3555 }
3556
3557 void SIInstrInfo::splitScalar64BitUnaryOp(
3558 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3559 unsigned Opcode) const {
3560 MachineBasicBlock &MBB = *Inst.getParent();
3561 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3562
3563 MachineOperand &Dest = Inst.getOperand(0);
3564 MachineOperand &Src0 = Inst.getOperand(1);
3565 DebugLoc DL = Inst.getDebugLoc();
3566
3567 MachineBasicBlock::iterator MII = Inst;
3568
3569 const MCInstrDesc &InstDesc = get(Opcode);
3570 const TargetRegisterClass *Src0RC = Src0.isReg() ?
3571 MRI.getRegClass(Src0.getReg()) :
3572 &AMDGPU::SGPR_32RegClass;
3573
3574 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3575
3576 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3577 AMDGPU::sub0, Src0SubRC);
3578
3579 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3580 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3581 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3582
3583 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3584 BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
3585
3586 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3587 AMDGPU::sub1, Src0SubRC);
3588
3589 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3590 BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
3591
3592 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3593 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3594 .addReg(DestSub0)
3595 .addImm(AMDGPU::sub0)
3596 .addReg(DestSub1)
3597 .addImm(AMDGPU::sub1);
3598
3599 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3600
3601 // We don't need to call legalizeOperands here because for a single operand,
3602 // src0 will support any kind of input.
3603
3604 // Move all users of this moved value.
3605 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3606 }
3607
3608 void SIInstrInfo::splitScalar64BitBinaryOp(
3609 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3610 unsigned Opcode) const {
3611 MachineBasicBlock &MBB = *Inst.getParent();
3612 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3613
3614 MachineOperand &Dest = Inst.getOperand(0);
3615 MachineOperand &Src0 = Inst.getOperand(1);
3616 MachineOperand &Src1 = Inst.getOperand(2);
3617 DebugLoc DL = Inst.getDebugLoc();
3618
3619 MachineBasicBlock::iterator MII = Inst;
3620
3621 const MCInstrDesc &InstDesc = get(Opcode);
3622 const TargetRegisterClass *Src0RC = Src0.isReg() ?
3623 MRI.getRegClass(Src0.getReg()) :
3624 &AMDGPU::SGPR_32RegClass;
3625
3626 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3627 const TargetRegisterClass *Src1RC = Src1.isReg() ?
3628 MRI.getRegClass(Src1.getReg()) :
3629 &AMDGPU::SGPR_32RegClass;
3630
3631 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3632
3633 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3634 AMDGPU::sub0, Src0SubRC);
3635 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3636 AMDGPU::sub0, Src1SubRC);
3637
3638 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3639 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3640 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3641
3642 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3643 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3644 .add(SrcReg0Sub0)
3645 .add(SrcReg1Sub0);
3646
3647 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3648 AMDGPU::sub1, Src0SubRC);
3649 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3650 AMDGPU::sub1, Src1SubRC);
3651
3652 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3653 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3654 .add(SrcReg0Sub1)
3655 .add(SrcReg1Sub1);
3656
3657 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3658 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3659 .addReg(DestSub0)
3660 .addImm(AMDGPU::sub0)
3661 .addReg(DestSub1)
3662 .addImm(AMDGPU::sub1);
3663
3664 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3665
3666 // Try to legalize the operands in case we need to swap the order to keep it
3667 // valid.
3668 legalizeOperands(LoHalf);
3669 legalizeOperands(HiHalf);
3670
3671 // Move all users of this moved value.
3672 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3673 }
3674
3675 void SIInstrInfo::splitScalar64BitBCNT(
3676 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
3677 MachineBasicBlock &MBB = *Inst.getParent();
3678 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3679
3680 MachineBasicBlock::iterator MII = Inst;
3681 DebugLoc DL = Inst.getDebugLoc();
3682
3683 MachineOperand &Dest = Inst.getOperand(0);
3684 MachineOperand &Src = Inst.getOperand(1);
3685
3686 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3687 const TargetRegisterClass *SrcRC = Src.isReg() ?
3688 MRI.getRegClass(Src.getReg()) :
3689 &AMDGPU::SGPR_32RegClass;
3690
3691 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3692 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3693
3694 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3695
3696 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3697 AMDGPU::sub0, SrcSubRC);
3698 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3699 AMDGPU::sub1, SrcSubRC);
3700
3701 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
3702
3703 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
3704
3705 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3706
3707 // We don't need to legalize operands here. src0 for either instruction can be
3708 // an SGPR, and the second input is unused or determined here.
3709 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3710 }
3711
3712 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
3713 MachineInstr &Inst) const {
3714 MachineBasicBlock &MBB = *Inst.getParent();
3715 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3716 MachineBasicBlock::iterator MII = Inst;
3717 DebugLoc DL = Inst.getDebugLoc();
3718
3719 MachineOperand &Dest = Inst.getOperand(0);
3720 uint32_t Imm = Inst.getOperand(2).getImm();
3721 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3722 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3723
3724 (void) Offset;
3725
3726 // Only sext_inreg cases handled.
3727 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
3728 Offset == 0 && "Not implemented");
3729
3730 if (BitWidth < 32) {
3731 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3732 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3733 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3734
3735 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
3736 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
3737 .addImm(0)
3738 .addImm(BitWidth);
3739
3740 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
3741 .addImm(31)
3742 .addReg(MidRegLo);
3743
3744 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3745 .addReg(MidRegLo)
3746 .addImm(AMDGPU::sub0)
3747 .addReg(MidRegHi)
3748 .addImm(AMDGPU::sub1);
3749
3750 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3751 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3752 return;
3753 }
3754
3755 MachineOperand &Src = Inst.getOperand(1);
3756 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3757 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3758
3759 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
3760 .addImm(31)
3761 .addReg(Src.getReg(), 0, AMDGPU::sub0);
3762
3763 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3764 .addReg(Src.getReg(), 0, AMDGPU::sub0)
3765 .addImm(AMDGPU::sub0)
3766 .addReg(TmpReg)
3767 .addImm(AMDGPU::sub1);
3768
3769 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3770 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3771 }
3772
3773 void SIInstrInfo::addUsersToMoveToVALUWorklist(
3774 unsigned DstReg,
3775 MachineRegisterInfo &MRI,
3776 SmallVectorImpl<MachineInstr *> &Worklist) const {
3777 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
3778 E = MRI.use_end(); I != E;) {
3779 MachineInstr &UseMI = *I->getParent();
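// If this user cannot accept a VGPR for the operand, queue it for moveToVALU,
// then advance the use iterator past any remaining uses belonging to the same
// instruction so each user is only pushed onto the worklist once.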
3780 if (!canReadVGPR(UseMI, I.getOperandNo())) { 3781 Worklist.push_back(&UseMI); 3782 3783 do { 3784 ++I; 3785 } while (I != E && I->getParent() == &UseMI); 3786 } else { 3787 ++I; 3788 } 3789 } 3790 } 3791 3792 void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, 3793 MachineRegisterInfo &MRI, 3794 MachineInstr &Inst) const { 3795 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3796 MachineBasicBlock *MBB = Inst.getParent(); 3797 MachineOperand &Src0 = Inst.getOperand(1); 3798 MachineOperand &Src1 = Inst.getOperand(2); 3799 const DebugLoc &DL = Inst.getDebugLoc(); 3800 3801 switch (Inst.getOpcode()) { 3802 case AMDGPU::S_PACK_LL_B32_B16: { 3803 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3804 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3805 3806 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 3807 // 0. 3808 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 3809 .addImm(0xffff); 3810 3811 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 3812 .addReg(ImmReg, RegState::Kill) 3813 .add(Src0); 3814 3815 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) 3816 .add(Src1) 3817 .addImm(16) 3818 .addReg(TmpReg, RegState::Kill); 3819 break; 3820 } 3821 case AMDGPU::S_PACK_LH_B32_B16: { 3822 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3823 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 3824 .addImm(0xffff); 3825 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) 3826 .addReg(ImmReg, RegState::Kill) 3827 .add(Src0) 3828 .add(Src1); 3829 break; 3830 } 3831 case AMDGPU::S_PACK_HH_B32_B16: { 3832 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3833 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3834 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 3835 .addImm(16) 3836 .add(Src0); 3837 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 3838 .addImm(0xffff0000); 3839 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) 3840 .add(Src1) 3841 .addReg(ImmReg, RegState::Kill) 3842 .addReg(TmpReg, RegState::Kill); 3843 break; 3844 } 3845 default: 3846 llvm_unreachable("unhandled s_pack_* instruction"); 3847 } 3848 3849 MachineOperand &Dest = Inst.getOperand(0); 3850 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3851 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3852 } 3853 3854 void SIInstrInfo::addSCCDefUsersToVALUWorklist( 3855 MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { 3856 // This assumes that all the users of SCC are in the same block 3857 // as the SCC def. 3858 for (MachineInstr &MI : 3859 llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), 3860 SCCDefInst.getParent()->end())) { 3861 // Exit if we find another SCC def. 3862 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) 3863 return; 3864 3865 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) 3866 Worklist.push_back(&MI); 3867 } 3868 } 3869 3870 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 3871 const MachineInstr &Inst) const { 3872 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 3873 3874 switch (Inst.getOpcode()) { 3875 // For target instructions, getOpRegClass just returns the virtual register 3876 // class associated with the operand, so we need to find an equivalent VGPR 3877 // register class in order to move the instruction to the VALU. 
3878 case AMDGPU::COPY: 3879 case AMDGPU::PHI: 3880 case AMDGPU::REG_SEQUENCE: 3881 case AMDGPU::INSERT_SUBREG: 3882 if (RI.hasVGPRs(NewDstRC)) 3883 return nullptr; 3884 3885 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 3886 if (!NewDstRC) 3887 return nullptr; 3888 return NewDstRC; 3889 default: 3890 return NewDstRC; 3891 } 3892 } 3893 3894 // Find the one SGPR operand we are allowed to use. 3895 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 3896 int OpIndices[3]) const { 3897 const MCInstrDesc &Desc = MI.getDesc(); 3898 3899 // Find the one SGPR operand we are allowed to use. 3900 // 3901 // First we need to consider the instruction's operand requirements before 3902 // legalizing. Some operands are required to be SGPRs, such as implicit uses 3903 // of VCC, but we are still bound by the constant bus requirement to only use 3904 // one. 3905 // 3906 // If the operand's class is an SGPR, we can never move it. 3907 3908 unsigned SGPRReg = findImplicitSGPRRead(MI); 3909 if (SGPRReg != AMDGPU::NoRegister) 3910 return SGPRReg; 3911 3912 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 3913 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3914 3915 for (unsigned i = 0; i < 3; ++i) { 3916 int Idx = OpIndices[i]; 3917 if (Idx == -1) 3918 break; 3919 3920 const MachineOperand &MO = MI.getOperand(Idx); 3921 if (!MO.isReg()) 3922 continue; 3923 3924 // Is this operand statically required to be an SGPR based on the operand 3925 // constraints? 3926 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 3927 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 3928 if (IsRequiredSGPR) 3929 return MO.getReg(); 3930 3931 // If this could be a VGPR or an SGPR, Check the dynamic register class. 3932 unsigned Reg = MO.getReg(); 3933 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 3934 if (RI.isSGPRClass(RegRC)) 3935 UsedSGPRs[i] = Reg; 3936 } 3937 3938 // We don't have a required SGPR operand, so we have a bit more freedom in 3939 // selecting operands to move. 3940 3941 // Try to select the most used SGPR. If an SGPR is equal to one of the 3942 // others, we choose that. 3943 // 3944 // e.g. 3945 // V_FMA_F32 v0, s0, s0, s0 -> No moves 3946 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 3947 3948 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 3949 // prefer those. 3950 3951 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 3952 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 3953 SGPRReg = UsedSGPRs[0]; 3954 } 3955 3956 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 3957 if (UsedSGPRs[1] == UsedSGPRs[2]) 3958 SGPRReg = UsedSGPRs[1]; 3959 } 3960 3961 return SGPRReg; 3962 } 3963 3964 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 3965 unsigned OperandName) const { 3966 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 3967 if (Idx == -1) 3968 return nullptr; 3969 3970 return &MI.getOperand(Idx); 3971 } 3972 3973 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 3974 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 3975 if (ST.isAmdHsaOS()) { 3976 // Set ATC = 1. GFX9 doesn't have this bit. 3977 if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) 3978 RsrcDataFormat |= (1ULL << 56); 3979 3980 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. 3981 // BTW, it disables TC L2 and therefore decreases performance. 
3982 if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
3983 RsrcDataFormat |= (2ULL << 59);
3984 }
3985
3986 return RsrcDataFormat;
3987 }
3988
3989 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
3990 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
3991 AMDGPU::RSRC_TID_ENABLE |
3992 0xffffffff; // Size.
3993
3994 // GFX9 doesn't have ELEMENT_SIZE.
3995 if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
3996 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
3997 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
3998 }
3999
4000 // IndexStride = 64.
4001 Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
4002
4003 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4004 // Clear them unless we want a huge stride.
4005 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4006 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4007
4008 return Rsrc23;
4009 }
4010
4011 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
4012 unsigned Opc = MI.getOpcode();
4013
4014 return isSMRD(Opc);
4015 }
4016
4017 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
4018 unsigned Opc = MI.getOpcode();
4019
4020 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4021 }
4022
4023 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
4024 int &FrameIndex) const {
4025 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4026 if (!Addr || !Addr->isFI())
4027 return AMDGPU::NoRegister;
4028
4029 assert(!MI.memoperands_empty() &&
4030 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
4031
4032 FrameIndex = Addr->getIndex();
4033 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4034 }
4035
4036 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
4037 int &FrameIndex) const {
4038 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4039 assert(Addr && Addr->isFI());
4040 FrameIndex = Addr->getIndex();
4041 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4042 }
4043
4044 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
4045 int &FrameIndex) const {
4046
4047 if (!MI.mayLoad())
4048 return AMDGPU::NoRegister;
4049
4050 if (isMUBUF(MI) || isVGPRSpill(MI))
4051 return isStackAccess(MI, FrameIndex);
4052
4053 if (isSGPRSpill(MI))
4054 return isSGPRStackAccess(MI, FrameIndex);
4055
4056 return AMDGPU::NoRegister;
4057 }
4058
4059 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
4060 int &FrameIndex) const {
4061 if (!MI.mayStore())
4062 return AMDGPU::NoRegister;
4063
4064 if (isMUBUF(MI) || isVGPRSpill(MI))
4065 return isStackAccess(MI, FrameIndex);
4066
4067 if (isSGPRSpill(MI))
4068 return isSGPRStackAccess(MI, FrameIndex);
4069
4070 return AMDGPU::NoRegister;
4071 }
4072
4073 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
4074 unsigned Opc = MI.getOpcode();
4075 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
4076 unsigned DescSize = Desc.getSize();
4077
4078 // If we have a definitive size, we can use it. Otherwise we need to inspect
4079 // the operands to know the size.
4080 //
4081 // FIXME: Instructions that have a base 32-bit encoding report their size as
4082 // 4, even though they are really 8 bytes if they have a literal operand.
4083 if (DescSize != 0 && DescSize != 4)
4084 return DescSize;
4085
4086 // 4-byte instructions may have a 32-bit literal encoded after them. Check
4087 // operands that could ever be literals.
4088 if (isVALU(MI) || isSALU(MI)) { 4089 if (isFixedSize(MI)) 4090 return DescSize; 4091 4092 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 4093 if (Src0Idx == -1) 4094 return 4; // No operands. 4095 4096 if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) 4097 return 8; 4098 4099 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 4100 if (Src1Idx == -1) 4101 return 4; 4102 4103 if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) 4104 return 8; 4105 4106 return 4; 4107 } 4108 4109 if (DescSize == 4) 4110 return 4; 4111 4112 switch (Opc) { 4113 case TargetOpcode::IMPLICIT_DEF: 4114 case TargetOpcode::KILL: 4115 case TargetOpcode::DBG_VALUE: 4116 case TargetOpcode::BUNDLE: 4117 case TargetOpcode::EH_LABEL: 4118 return 0; 4119 case TargetOpcode::INLINEASM: { 4120 const MachineFunction *MF = MI.getParent()->getParent(); 4121 const char *AsmStr = MI.getOperand(0).getSymbolName(); 4122 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); 4123 } 4124 default: 4125 llvm_unreachable("unable to find instruction size"); 4126 } 4127 } 4128 4129 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 4130 if (!isFLAT(MI)) 4131 return false; 4132 4133 if (MI.memoperands_empty()) 4134 return true; 4135 4136 for (const MachineMemOperand *MMO : MI.memoperands()) { 4137 if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) 4138 return true; 4139 } 4140 return false; 4141 } 4142 4143 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 4144 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 4145 } 4146 4147 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 4148 MachineBasicBlock *IfEnd) const { 4149 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 4150 assert(TI != IfEntry->end()); 4151 4152 MachineInstr *Branch = &(*TI); 4153 MachineFunction *MF = IfEntry->getParent(); 4154 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 4155 4156 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 4157 unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4158 MachineInstr *SIIF = 4159 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 4160 .add(Branch->getOperand(0)) 4161 .add(Branch->getOperand(1)); 4162 MachineInstr *SIEND = 4163 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 4164 .addReg(DstReg); 4165 4166 IfEntry->erase(TI); 4167 IfEntry->insert(IfEntry->end(), SIIF); 4168 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 4169 } 4170 } 4171 4172 void SIInstrInfo::convertNonUniformLoopRegion( 4173 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 4174 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 4175 // We expect 2 terminators, one conditional and one unconditional. 
4176 assert(TI != LoopEnd->end()); 4177 4178 MachineInstr *Branch = &(*TI); 4179 MachineFunction *MF = LoopEnd->getParent(); 4180 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 4181 4182 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 4183 4184 unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4185 unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4186 MachineInstrBuilder HeaderPHIBuilder = 4187 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 4188 for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), 4189 E = LoopEntry->pred_end(); 4190 PI != E; ++PI) { 4191 if (*PI == LoopEnd) { 4192 HeaderPHIBuilder.addReg(BackEdgeReg); 4193 } else { 4194 MachineBasicBlock *PMBB = *PI; 4195 unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4196 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 4197 ZeroReg, 0); 4198 HeaderPHIBuilder.addReg(ZeroReg); 4199 } 4200 HeaderPHIBuilder.addMBB(*PI); 4201 } 4202 MachineInstr *HeaderPhi = HeaderPHIBuilder; 4203 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 4204 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 4205 .addReg(DstReg) 4206 .add(Branch->getOperand(0)); 4207 MachineInstr *SILOOP = 4208 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 4209 .addReg(BackEdgeReg) 4210 .addMBB(LoopEntry); 4211 4212 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 4213 LoopEnd->erase(TI); 4214 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 4215 LoopEnd->insert(LoopEnd->end(), SILOOP); 4216 } 4217 } 4218 4219 ArrayRef<std::pair<int, const char *>> 4220 SIInstrInfo::getSerializableTargetIndices() const { 4221 static const std::pair<int, const char *> TargetIndices[] = { 4222 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 4223 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 4224 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 4225 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 4226 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 4227 return makeArrayRef(TargetIndices); 4228 } 4229 4230 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 4231 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 4232 ScheduleHazardRecognizer * 4233 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 4234 const ScheduleDAG *DAG) const { 4235 return new GCNHazardRecognizer(DAG->MF); 4236 } 4237 4238 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 4239 /// pass. 4240 ScheduleHazardRecognizer * 4241 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 4242 return new GCNHazardRecognizer(MF); 4243 } 4244 4245 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 4246 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 4247 MI.modifiesRegister(AMDGPU::EXEC, &RI); 4248 } 4249 4250 MachineInstrBuilder 4251 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 4252 MachineBasicBlock::iterator I, 4253 const DebugLoc &DL, 4254 unsigned DestReg) const { 4255 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4256 4257 unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4258 4259 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) 4260 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 4261 } 4262
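// Note on getAddNoCarry: the returned builder already has the destination and
// a dead, unused carry-out def attached; callers are expected to append the
// two source operands themselves. A typical (illustrative only; the register
// names below are placeholders, not identifiers from this file) use would be:
//   TII->getAddNoCarry(MBB, I, DL, DestReg)
//       .addReg(Src0Reg)
//       .addReg(Src1Reg);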