1 //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "SIInstrInfo.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "GCNHazardRecognizer.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineInstrBuilder.h" 22 #include "llvm/CodeGen/MachineRegisterInfo.h" 23 #include "llvm/CodeGen/ScheduleDAG.h" 24 #include "llvm/IR/DiagnosticInfo.h" 25 #include "llvm/IR/Function.h" 26 #include "llvm/CodeGen/RegisterScavenging.h" 27 #include "llvm/MC/MCInstrDesc.h" 28 #include "llvm/Support/Debug.h" 29 30 using namespace llvm; 31 32 // Must be at least 4 to be able to branch over minimum unconditional branch 33 // code. This is only for making it possible to write reasonably small tests for 34 // long branches. 
35 static cl::opt<unsigned> 36 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), 37 cl::desc("Restrict range of branch instructions (DEBUG)")); 38 39 SIInstrInfo::SIInstrInfo(const SISubtarget &ST) 40 : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} 41 42 //===----------------------------------------------------------------------===// 43 // TargetInstrInfo callbacks 44 //===----------------------------------------------------------------------===// 45 46 static unsigned getNumOperandsNoGlue(SDNode *Node) { 47 unsigned N = Node->getNumOperands(); 48 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 49 --N; 50 return N; 51 } 52 53 static SDValue findChainOperand(SDNode *Load) { 54 SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); 55 assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); 56 return LastOp; 57 } 58 59 /// \brief Returns true if both nodes have the same value for the given 60 /// operand \p Op, or if both nodes do not have this operand. 61 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 62 unsigned Opc0 = N0->getMachineOpcode(); 63 unsigned Opc1 = N1->getMachineOpcode(); 64 65 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 66 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 67 68 if (Op0Idx == -1 && Op1Idx == -1) 69 return true; 70 71 72 if ((Op0Idx == -1 && Op1Idx != -1) || 73 (Op1Idx == -1 && Op0Idx != -1)) 74 return false; 75 76 // getNamedOperandIdx returns the index for the MachineInstr's operands, 77 // which includes the result as the first operand. We are indexing into the 78 // MachineSDNode's operands, so we need to skip the result operand to get 79 // the real index. 
80 --Op0Idx; 81 --Op1Idx; 82 83 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 84 } 85 86 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, 87 AliasAnalysis *AA) const { 88 // TODO: The generic check fails for VALU instructions that should be 89 // rematerializable due to implicit reads of exec. We really want all of the 90 // generic logic for this except for this. 91 switch (MI.getOpcode()) { 92 case AMDGPU::V_MOV_B32_e32: 93 case AMDGPU::V_MOV_B32_e64: 94 case AMDGPU::V_MOV_B64_PSEUDO: 95 return true; 96 default: 97 return false; 98 } 99 } 100 101 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 102 int64_t &Offset0, 103 int64_t &Offset1) const { 104 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 105 return false; 106 107 unsigned Opc0 = Load0->getMachineOpcode(); 108 unsigned Opc1 = Load1->getMachineOpcode(); 109 110 // Make sure both are actually loads. 111 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) 112 return false; 113 114 if (isDS(Opc0) && isDS(Opc1)) { 115 116 // FIXME: Handle this case: 117 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) 118 return false; 119 120 // Check base reg. 121 if (Load0->getOperand(1) != Load1->getOperand(1)) 122 return false; 123 124 // Check chain. 125 if (findChainOperand(Load0) != findChainOperand(Load1)) 126 return false; 127 128 // Skip read2 / write2 variants for simplicity. 129 // TODO: We should report true if the used offsets are adjacent (excluded 130 // st64 versions). 131 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || 132 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) 133 return false; 134 135 Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); 136 Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); 137 return true; 138 } 139 140 if (isSMRD(Opc0) && isSMRD(Opc1)) { 141 // Skip time and cache invalidation instructions. 
142 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || 143 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) 144 return false; 145 146 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); 147 148 // Check base reg. 149 if (Load0->getOperand(0) != Load1->getOperand(0)) 150 return false; 151 152 const ConstantSDNode *Load0Offset = 153 dyn_cast<ConstantSDNode>(Load0->getOperand(1)); 154 const ConstantSDNode *Load1Offset = 155 dyn_cast<ConstantSDNode>(Load1->getOperand(1)); 156 157 if (!Load0Offset || !Load1Offset) 158 return false; 159 160 // Check chain. 161 if (findChainOperand(Load0) != findChainOperand(Load1)) 162 return false; 163 164 Offset0 = Load0Offset->getZExtValue(); 165 Offset1 = Load1Offset->getZExtValue(); 166 return true; 167 } 168 169 // MUBUF and MTBUF can access the same addresses. 170 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 171 172 // MUBUF and MTBUF have vaddr at different indices. 173 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 174 findChainOperand(Load0) != findChainOperand(Load1) || 175 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 176 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 177 return false; 178 179 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 180 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 181 182 if (OffIdx0 == -1 || OffIdx1 == -1) 183 return false; 184 185 // getNamedOperandIdx returns the index for MachineInstrs. Since they 186 // inlcude the output in the operand list, but SDNodes don't, we need to 187 // subtract the index by one. 188 --OffIdx0; 189 --OffIdx1; 190 191 SDValue Off0 = Load0->getOperand(OffIdx0); 192 SDValue Off1 = Load1->getOperand(OffIdx1); 193 194 // The offset might be a FrameIndexSDNode. 
195 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 196 return false; 197 198 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 199 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 200 return true; 201 } 202 203 return false; 204 } 205 206 static bool isStride64(unsigned Opc) { 207 switch (Opc) { 208 case AMDGPU::DS_READ2ST64_B32: 209 case AMDGPU::DS_READ2ST64_B64: 210 case AMDGPU::DS_WRITE2ST64_B32: 211 case AMDGPU::DS_WRITE2ST64_B64: 212 return true; 213 default: 214 return false; 215 } 216 } 217 218 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 219 int64_t &Offset, 220 const TargetRegisterInfo *TRI) const { 221 unsigned Opc = LdSt.getOpcode(); 222 223 if (isDS(LdSt)) { 224 const MachineOperand *OffsetImm = 225 getNamedOperand(LdSt, AMDGPU::OpName::offset); 226 if (OffsetImm) { 227 // Normal, single offset LDS instruction. 228 const MachineOperand *AddrReg = 229 getNamedOperand(LdSt, AMDGPU::OpName::addr); 230 231 BaseReg = AddrReg->getReg(); 232 Offset = OffsetImm->getImm(); 233 return true; 234 } 235 236 // The 2 offset instructions use offset0 and offset1 instead. We can treat 237 // these as a load with a single offset if the 2 offsets are consecutive. We 238 // will use this for some partially aligned loads. 239 const MachineOperand *Offset0Imm = 240 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 241 const MachineOperand *Offset1Imm = 242 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 243 244 uint8_t Offset0 = Offset0Imm->getImm(); 245 uint8_t Offset1 = Offset1Imm->getImm(); 246 247 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 248 // Each of these offsets is in element sized units, so we need to convert 249 // to bytes of the individual reads. 
250 251 unsigned EltSize; 252 if (LdSt.mayLoad()) 253 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 254 else { 255 assert(LdSt.mayStore()); 256 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 257 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 258 } 259 260 if (isStride64(Opc)) 261 EltSize *= 64; 262 263 const MachineOperand *AddrReg = 264 getNamedOperand(LdSt, AMDGPU::OpName::addr); 265 BaseReg = AddrReg->getReg(); 266 Offset = EltSize * Offset0; 267 return true; 268 } 269 270 return false; 271 } 272 273 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 274 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 275 if (SOffset && SOffset->isReg()) 276 return false; 277 278 const MachineOperand *AddrReg = 279 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 280 if (!AddrReg) 281 return false; 282 283 const MachineOperand *OffsetImm = 284 getNamedOperand(LdSt, AMDGPU::OpName::offset); 285 BaseReg = AddrReg->getReg(); 286 Offset = OffsetImm->getImm(); 287 288 if (SOffset) // soffset can be an inline immediate. 
289 Offset += SOffset->getImm(); 290 291 return true; 292 } 293 294 if (isSMRD(LdSt)) { 295 const MachineOperand *OffsetImm = 296 getNamedOperand(LdSt, AMDGPU::OpName::offset); 297 if (!OffsetImm) 298 return false; 299 300 const MachineOperand *SBaseReg = 301 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 302 BaseReg = SBaseReg->getReg(); 303 Offset = OffsetImm->getImm(); 304 return true; 305 } 306 307 if (isFLAT(LdSt)) { 308 const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 309 BaseReg = AddrReg->getReg(); 310 Offset = 0; 311 return true; 312 } 313 314 return false; 315 } 316 317 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 318 MachineInstr &SecondLdSt, 319 unsigned NumLoads) const { 320 const MachineOperand *FirstDst = nullptr; 321 const MachineOperand *SecondDst = nullptr; 322 323 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 324 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 325 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 326 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 327 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 328 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 329 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 330 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 331 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 332 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 333 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 334 } 335 336 if (!FirstDst || !SecondDst) 337 return false; 338 339 // Try to limit clustering based on the total number of bytes loaded 340 // rather than the number of instructions. This is done to help reduce 341 // register pressure. The method used is somewhat inexact, though, 342 // because it assumes that all loads in the cluster will load the 343 // same number of bytes as FirstLdSt. 344 345 // The unit of this value is bytes. 346 // FIXME: This needs finer tuning. 
347 unsigned LoadClusterThreshold = 16; 348 349 const MachineRegisterInfo &MRI = 350 FirstLdSt.getParent()->getParent()->getRegInfo(); 351 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 352 353 return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; 354 } 355 356 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 357 MachineBasicBlock::iterator MI, 358 const DebugLoc &DL, unsigned DestReg, 359 unsigned SrcReg, bool KillSrc) { 360 MachineFunction *MF = MBB.getParent(); 361 DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), 362 "illegal SGPR to VGPR copy", 363 DL, DS_Error); 364 LLVMContext &C = MF->getFunction()->getContext(); 365 C.diagnose(IllegalCopy); 366 367 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 368 .addReg(SrcReg, getKillRegState(KillSrc)); 369 } 370 371 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 372 MachineBasicBlock::iterator MI, 373 const DebugLoc &DL, unsigned DestReg, 374 unsigned SrcReg, bool KillSrc) const { 375 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 376 377 if (RC == &AMDGPU::VGPR_32RegClass) { 378 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 379 AMDGPU::SReg_32RegClass.contains(SrcReg)); 380 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 381 .addReg(SrcReg, getKillRegState(KillSrc)); 382 return; 383 } 384 385 if (RC == &AMDGPU::SReg_32_XM0RegClass || 386 RC == &AMDGPU::SReg_32RegClass) { 387 if (SrcReg == AMDGPU::SCC) { 388 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 389 .addImm(-1) 390 .addImm(0); 391 return; 392 } 393 394 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 395 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 396 return; 397 } 398 399 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 400 .addReg(SrcReg, getKillRegState(KillSrc)); 401 return; 402 } 403 404 if (RC == &AMDGPU::SReg_64RegClass) { 405 if (DestReg == AMDGPU::VCC) { 406 if 
(AMDGPU::SReg_64RegClass.contains(SrcReg)) { 407 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 408 .addReg(SrcReg, getKillRegState(KillSrc)); 409 } else { 410 // FIXME: Hack until VReg_1 removed. 411 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 412 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 413 .addImm(0) 414 .addReg(SrcReg, getKillRegState(KillSrc)); 415 } 416 417 return; 418 } 419 420 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 421 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 422 return; 423 } 424 425 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 426 .addReg(SrcReg, getKillRegState(KillSrc)); 427 return; 428 } 429 430 if (DestReg == AMDGPU::SCC) { 431 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 432 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 433 .addReg(SrcReg, getKillRegState(KillSrc)) 434 .addImm(0); 435 return; 436 } 437 438 unsigned EltSize = 4; 439 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 440 if (RI.isSGPRClass(RC)) { 441 if (RI.getRegSizeInBits(*RC) > 32) { 442 Opcode = AMDGPU::S_MOV_B64; 443 EltSize = 8; 444 } else { 445 Opcode = AMDGPU::S_MOV_B32; 446 EltSize = 4; 447 } 448 449 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 450 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 451 return; 452 } 453 } 454 455 456 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 457 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 458 459 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 460 unsigned SubIdx; 461 if (Forward) 462 SubIdx = SubIndices[Idx]; 463 else 464 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 465 466 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 467 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 468 469 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 470 471 if (Idx == SubIndices.size() - 1) 472 Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 473 474 if (Idx == 0) 475 Builder.addReg(DestReg, 
RegState::Define | RegState::Implicit); 476 477 Builder.addReg(SrcReg, RegState::Implicit); 478 } 479 } 480 481 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 482 int NewOpc; 483 484 // Try to map original to commuted opcode 485 NewOpc = AMDGPU::getCommuteRev(Opcode); 486 if (NewOpc != -1) 487 // Check if the commuted (REV) opcode exists on the target. 488 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 489 490 // Try to map commuted to original opcode 491 NewOpc = AMDGPU::getCommuteOrig(Opcode); 492 if (NewOpc != -1) 493 // Check if the original (non-REV) opcode exists on the target. 494 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 495 496 return Opcode; 497 } 498 499 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 500 MachineBasicBlock::iterator MI, 501 const DebugLoc &DL, unsigned DestReg, 502 int64_t Value) const { 503 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 504 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 505 if (RegClass == &AMDGPU::SReg_32RegClass || 506 RegClass == &AMDGPU::SGPR_32RegClass || 507 RegClass == &AMDGPU::SReg_32_XM0RegClass || 508 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 509 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 510 .addImm(Value); 511 return; 512 } 513 514 if (RegClass == &AMDGPU::SReg_64RegClass || 515 RegClass == &AMDGPU::SGPR_64RegClass || 516 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 517 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 518 .addImm(Value); 519 return; 520 } 521 522 if (RegClass == &AMDGPU::VGPR_32RegClass) { 523 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 524 .addImm(Value); 525 return; 526 } 527 if (RegClass == &AMDGPU::VReg_64RegClass) { 528 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 529 .addImm(Value); 530 return; 531 } 532 533 unsigned EltSize = 4; 534 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 535 if (RI.isSGPRClass(RegClass)) { 536 if (RI.getRegSizeInBits(*RegClass) > 32) { 537 
Opcode = AMDGPU::S_MOV_B64; 538 EltSize = 8; 539 } else { 540 Opcode = AMDGPU::S_MOV_B32; 541 EltSize = 4; 542 } 543 } 544 545 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 546 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 547 int64_t IdxValue = Idx == 0 ? Value : 0; 548 549 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 550 get(Opcode), RI.getSubReg(DestReg, Idx)); 551 Builder.addImm(IdxValue); 552 } 553 } 554 555 const TargetRegisterClass * 556 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 557 return &AMDGPU::VGPR_32RegClass; 558 } 559 560 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 561 MachineBasicBlock::iterator I, 562 const DebugLoc &DL, unsigned DstReg, 563 ArrayRef<MachineOperand> Cond, 564 unsigned TrueReg, 565 unsigned FalseReg) const { 566 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 567 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 568 "Not a VGPR32 reg"); 569 570 if (Cond.size() == 1) { 571 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 572 .addReg(FalseReg) 573 .addReg(TrueReg) 574 .add(Cond[0]); 575 } else if (Cond.size() == 2) { 576 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 577 switch (Cond[0].getImm()) { 578 case SIInstrInfo::SCC_TRUE: { 579 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 580 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 581 .addImm(-1) 582 .addImm(0); 583 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 584 .addReg(FalseReg) 585 .addReg(TrueReg) 586 .addReg(SReg); 587 break; 588 } 589 case SIInstrInfo::SCC_FALSE: { 590 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 591 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 592 .addImm(0) 593 .addImm(-1); 594 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 595 .addReg(FalseReg) 596 .addReg(TrueReg) 597 .addReg(SReg); 598 break; 599 } 600 case SIInstrInfo::VCCNZ: { 601 
MachineOperand RegOp = Cond[1]; 602 RegOp.setImplicit(false); 603 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 604 .addReg(FalseReg) 605 .addReg(TrueReg) 606 .add(RegOp); 607 break; 608 } 609 case SIInstrInfo::VCCZ: { 610 MachineOperand RegOp = Cond[1]; 611 RegOp.setImplicit(false); 612 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 613 .addReg(TrueReg) 614 .addReg(FalseReg) 615 .add(RegOp); 616 break; 617 } 618 case SIInstrInfo::EXECNZ: { 619 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 620 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 621 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 622 .addImm(0); 623 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 624 .addImm(-1) 625 .addImm(0); 626 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 627 .addReg(FalseReg) 628 .addReg(TrueReg) 629 .addReg(SReg); 630 break; 631 } 632 case SIInstrInfo::EXECZ: { 633 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 634 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 635 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 636 .addImm(0); 637 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 638 .addImm(0) 639 .addImm(-1); 640 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 641 .addReg(FalseReg) 642 .addReg(TrueReg) 643 .addReg(SReg); 644 llvm_unreachable("Unhandled branch predicate EXECZ"); 645 break; 646 } 647 default: 648 llvm_unreachable("invalid branch predicate"); 649 } 650 } else { 651 llvm_unreachable("Can only handle Cond size 1 or 2"); 652 } 653 } 654 655 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 656 MachineBasicBlock::iterator I, 657 const DebugLoc &DL, 658 unsigned SrcReg, int Value) const { 659 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 660 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 661 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 662 
.addImm(Value) 663 .addReg(SrcReg); 664 665 return Reg; 666 } 667 668 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, 669 MachineBasicBlock::iterator I, 670 const DebugLoc &DL, 671 unsigned SrcReg, int Value) const { 672 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 673 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 674 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 675 .addImm(Value) 676 .addReg(SrcReg); 677 678 return Reg; 679 } 680 681 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 682 683 if (RI.getRegSizeInBits(*DstRC) == 32) { 684 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 685 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 686 return AMDGPU::S_MOV_B64; 687 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 688 return AMDGPU::V_MOV_B64_PSEUDO; 689 } 690 return AMDGPU::COPY; 691 } 692 693 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 694 switch (Size) { 695 case 4: 696 return AMDGPU::SI_SPILL_S32_SAVE; 697 case 8: 698 return AMDGPU::SI_SPILL_S64_SAVE; 699 case 16: 700 return AMDGPU::SI_SPILL_S128_SAVE; 701 case 32: 702 return AMDGPU::SI_SPILL_S256_SAVE; 703 case 64: 704 return AMDGPU::SI_SPILL_S512_SAVE; 705 default: 706 llvm_unreachable("unknown register size"); 707 } 708 } 709 710 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 711 switch (Size) { 712 case 4: 713 return AMDGPU::SI_SPILL_V32_SAVE; 714 case 8: 715 return AMDGPU::SI_SPILL_V64_SAVE; 716 case 12: 717 return AMDGPU::SI_SPILL_V96_SAVE; 718 case 16: 719 return AMDGPU::SI_SPILL_V128_SAVE; 720 case 32: 721 return AMDGPU::SI_SPILL_V256_SAVE; 722 case 64: 723 return AMDGPU::SI_SPILL_V512_SAVE; 724 default: 725 llvm_unreachable("unknown register size"); 726 } 727 } 728 729 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 730 MachineBasicBlock::iterator MI, 731 unsigned SrcReg, bool isKill, 732 int FrameIndex, 733 
const TargetRegisterClass *RC, 734 const TargetRegisterInfo *TRI) const { 735 MachineFunction *MF = MBB.getParent(); 736 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 737 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 738 DebugLoc DL = MBB.findDebugLoc(MI); 739 740 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 741 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 742 MachinePointerInfo PtrInfo 743 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 744 MachineMemOperand *MMO 745 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 746 Size, Align); 747 unsigned SpillSize = TRI->getSpillSize(*RC); 748 749 if (RI.isSGPRClass(RC)) { 750 MFI->setHasSpilledSGPRs(); 751 752 // We are only allowed to create one new instruction when spilling 753 // registers, so we need to use pseudo instruction for spilling SGPRs. 754 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); 755 756 // The SGPR spill/restore instructions only work on number sgprs, so we need 757 // to make sure we are using the correct register class. 758 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { 759 MachineRegisterInfo &MRI = MF->getRegInfo(); 760 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 761 } 762 763 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) 764 .addReg(SrcReg, getKillRegState(isKill)) // data 765 .addFrameIndex(FrameIndex) // addr 766 .addMemOperand(MMO) 767 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 768 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 769 // Add the scratch resource registers as implicit uses because we may end up 770 // needing them, and need to ensure that the reserved registers are 771 // correctly handled. 772 773 if (ST.hasScalarStores()) { 774 // m0 is used for offset to scalar stores if used to spill. 
775 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); 776 } 777 778 return; 779 } 780 781 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 782 LLVMContext &Ctx = MF->getFunction()->getContext(); 783 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 784 " spill register"); 785 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 786 .addReg(SrcReg); 787 788 return; 789 } 790 791 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 792 793 unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); 794 MFI->setHasSpilledVGPRs(); 795 BuildMI(MBB, MI, DL, get(Opcode)) 796 .addReg(SrcReg, getKillRegState(isKill)) // data 797 .addFrameIndex(FrameIndex) // addr 798 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 799 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 800 .addImm(0) // offset 801 .addMemOperand(MMO); 802 } 803 804 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 805 switch (Size) { 806 case 4: 807 return AMDGPU::SI_SPILL_S32_RESTORE; 808 case 8: 809 return AMDGPU::SI_SPILL_S64_RESTORE; 810 case 16: 811 return AMDGPU::SI_SPILL_S128_RESTORE; 812 case 32: 813 return AMDGPU::SI_SPILL_S256_RESTORE; 814 case 64: 815 return AMDGPU::SI_SPILL_S512_RESTORE; 816 default: 817 llvm_unreachable("unknown register size"); 818 } 819 } 820 821 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 822 switch (Size) { 823 case 4: 824 return AMDGPU::SI_SPILL_V32_RESTORE; 825 case 8: 826 return AMDGPU::SI_SPILL_V64_RESTORE; 827 case 12: 828 return AMDGPU::SI_SPILL_V96_RESTORE; 829 case 16: 830 return AMDGPU::SI_SPILL_V128_RESTORE; 831 case 32: 832 return AMDGPU::SI_SPILL_V256_RESTORE; 833 case 64: 834 return AMDGPU::SI_SPILL_V512_RESTORE; 835 default: 836 llvm_unreachable("unknown register size"); 837 } 838 } 839 840 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 841 MachineBasicBlock::iterator MI, 842 unsigned DestReg, int FrameIndex, 843 const TargetRegisterClass *RC, 844 const TargetRegisterInfo *TRI) const { 845 MachineFunction 
*MF = MBB.getParent(); 846 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 847 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 848 DebugLoc DL = MBB.findDebugLoc(MI); 849 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 850 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 851 unsigned SpillSize = TRI->getSpillSize(*RC); 852 853 MachinePointerInfo PtrInfo 854 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 855 856 MachineMemOperand *MMO = MF->getMachineMemOperand( 857 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 858 859 if (RI.isSGPRClass(RC)) { 860 // FIXME: Maybe this should not include a memoperand because it will be 861 // lowered to non-memory instructions. 862 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 863 if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { 864 MachineRegisterInfo &MRI = MF->getRegInfo(); 865 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 866 } 867 868 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 869 .addFrameIndex(FrameIndex) // addr 870 .addMemOperand(MMO) 871 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 872 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 873 874 if (ST.hasScalarStores()) { 875 // m0 is used for offset to scalar stores if used to spill. 
      // m0 carries the offset for scalar stores, so the restore clobbers it.
      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
    }

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    // Emit a placeholder def so the verifier still sees DestReg defined.
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)        // vaddr
    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
    .addReg(MFI->getFrameOffsetReg()) // scratch_offset
    .addImm(0)                        // offset
    .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  // The per-thread ID used as the LDS address base is computed once, in the
  // entry block, and cached in the function info.
  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    // No free VGPR was found; signal failure with NoRegister.
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      // Multiple waves per workgroup: derive a workgroup-unique thread id
      // from the preloaded workgroup ids and the dispatch's NGROUPS counts.
      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the thread id by 4 to get a byte offset.
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

// Insert s_nop instructions totaling \p Count wait states before \p MI.
// Each s_nop encodes 1-8 wait states (immediate 0-7).
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

// Append the appropriate return (s_endpgm or epilog return) to a block with
// no successors that is missing a terminator.
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator)
      BuildMI(MBB, MBB.end(), DebugLoc(),
              get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    // s_nop N waits N+1 states (see insertWaitStates).
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;
  }
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Split a 64-bit move into two 32-bit moves of the sub-registers. Each
    // half also implicitly defines the full register to keep liveness exact.
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOVRELD_B32_V1:
  case AMDGPU::V_MOVRELD_B32_V2:
  case AMDGPU::V_MOVRELD_B32_V4:
  case AMDGPU::V_MOVRELD_B32_V8:
  case AMDGPU::V_MOVRELD_B32_V16: {
    // Lower to a real v_movreld_b32 writing one lane of the vector register;
    // the implicit def and use of the full vector are tied so the whole
    // register stays live across the indexed write.
    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    unsigned VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstr *MovRel =
      BuildMI(MBB, MI, DL, MovRelDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg,
                RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    const int ImpDefIdx =
      MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI.getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                   .addReg(RegLo)
                   .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                              .addReg(RegHi);
    // Without a high-half symbol operand the carry add just propagates the
    // carry into the upper 32 bits.
    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
      MIB.addImm(0);
    else
      MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    llvm::finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  }
  return true;
}

// Swap the src0/src1 modifier immediates of \p MI, if present. Returns false
// when the instruction has no source modifier operands.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

// Commute a register operand with an immediate/frame-index operand in place.
// Returns &MI on success, nullptr if the non-register operand kind is not
// supported.
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  unsigned Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  // Move the register (with all of its flags) into the other slot.
  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!MI.isCommutable())
    return false;

  unsigned Opc = MI.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  // BranchOffsetBits defaults to the hardware's 16-bit SIMM16 field but can
  // be narrowed by -amdgpu-s-branch-bits for testing.
  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    // 64-bit add of the (relocated) forward offset to the PC, done as two
    // 32-bit halves with carry.
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  //   bar;
  //   s_branch dest_bb
  //
  // restore_bb:
  //   restore s[8:9]
  //   fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  // Replace the placeholder virtual register with a scavenged physical
  // SGPR pair; post-RA code must not leave virtual registers behind.
  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
                                       MachineBasicBlock::iterator(GetPC), 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Size in bytes: s_getpc_b64 + add/addc pair + s_setpc_b64.
  return 4 + 8 + 4 + 4;
}

unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

// Analyze the terminators starting at \p I. Returns true if the sequence
// cannot be understood (per the analyzeBranch contract).
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    // Pre-RA pseudo: condition is a register operand rather than an implicit
    // SCC/VCC/EXEC predicate.
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Conditional branch followed by an unconditional branch.
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  // Skip over the si_mask_branch and analyze what follows it.
  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    // si_mask_branch is not a real branch; leave it in place.
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
1535 static void preserveCondRegFlags(MachineOperand &CondReg, 1536 const MachineOperand &OrigCond) { 1537 CondReg.setIsUndef(OrigCond.isUndef()); 1538 CondReg.setIsKill(OrigCond.isKill()); 1539 } 1540 1541 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 1542 MachineBasicBlock *TBB, 1543 MachineBasicBlock *FBB, 1544 ArrayRef<MachineOperand> Cond, 1545 const DebugLoc &DL, 1546 int *BytesAdded) const { 1547 1548 if (!FBB && Cond.empty()) { 1549 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1550 .addMBB(TBB); 1551 if (BytesAdded) 1552 *BytesAdded = 4; 1553 return 1; 1554 } 1555 1556 if(Cond.size() == 1 && Cond[0].isReg()) { 1557 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 1558 .add(Cond[0]) 1559 .addMBB(TBB); 1560 return 1; 1561 } 1562 1563 assert(TBB && Cond[0].isImm()); 1564 1565 unsigned Opcode 1566 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 1567 1568 if (!FBB) { 1569 Cond[1].isUndef(); 1570 MachineInstr *CondBr = 1571 BuildMI(&MBB, DL, get(Opcode)) 1572 .addMBB(TBB); 1573 1574 // Copy the flags onto the implicit condition register operand. 
1575 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1576 1577 if (BytesAdded) 1578 *BytesAdded = 4; 1579 return 1; 1580 } 1581 1582 assert(TBB && FBB); 1583 1584 MachineInstr *CondBr = 1585 BuildMI(&MBB, DL, get(Opcode)) 1586 .addMBB(TBB); 1587 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1588 .addMBB(FBB); 1589 1590 MachineOperand &CondReg = CondBr->getOperand(1); 1591 CondReg.setIsUndef(Cond[1].isUndef()); 1592 CondReg.setIsKill(Cond[1].isKill()); 1593 1594 if (BytesAdded) 1595 *BytesAdded = 8; 1596 1597 return 2; 1598 } 1599 1600 bool SIInstrInfo::reverseBranchCondition( 1601 SmallVectorImpl<MachineOperand> &Cond) const { 1602 if (Cond.size() != 2) { 1603 return true; 1604 } 1605 1606 if (Cond[0].isImm()) { 1607 Cond[0].setImm(-Cond[0].getImm()); 1608 return false; 1609 } 1610 1611 return true; 1612 } 1613 1614 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1615 ArrayRef<MachineOperand> Cond, 1616 unsigned TrueReg, unsigned FalseReg, 1617 int &CondCycles, 1618 int &TrueCycles, int &FalseCycles) const { 1619 switch (Cond[0].getImm()) { 1620 case VCCNZ: 1621 case VCCZ: { 1622 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1623 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1624 assert(MRI.getRegClass(FalseReg) == RC); 1625 1626 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1627 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1628 1629 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1630 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1631 } 1632 case SCC_TRUE: 1633 case SCC_FALSE: { 1634 // FIXME: We could insert for VGPRs if we could replace the original compare 1635 // with a vector one. 
1636 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1637 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1638 assert(MRI.getRegClass(FalseReg) == RC); 1639 1640 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1641 1642 // Multiples of 8 can do s_cselect_b64 1643 if (NumInsts % 2 == 0) 1644 NumInsts /= 2; 1645 1646 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1647 return RI.isSGPRClass(RC); 1648 } 1649 default: 1650 return false; 1651 } 1652 } 1653 1654 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 1655 MachineBasicBlock::iterator I, const DebugLoc &DL, 1656 unsigned DstReg, ArrayRef<MachineOperand> Cond, 1657 unsigned TrueReg, unsigned FalseReg) const { 1658 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 1659 if (Pred == VCCZ || Pred == SCC_FALSE) { 1660 Pred = static_cast<BranchPredicate>(-Pred); 1661 std::swap(TrueReg, FalseReg); 1662 } 1663 1664 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1665 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 1666 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 1667 1668 if (DstSize == 32) { 1669 unsigned SelOp = Pred == SCC_TRUE ? 1670 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 1671 1672 // Instruction's operands are backwards from what is expected. 
1673 MachineInstr *Select = 1674 BuildMI(MBB, I, DL, get(SelOp), DstReg) 1675 .addReg(FalseReg) 1676 .addReg(TrueReg); 1677 1678 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 1679 return; 1680 } 1681 1682 if (DstSize == 64 && Pred == SCC_TRUE) { 1683 MachineInstr *Select = 1684 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 1685 .addReg(FalseReg) 1686 .addReg(TrueReg); 1687 1688 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 1689 return; 1690 } 1691 1692 static const int16_t Sub0_15[] = { 1693 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1694 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1695 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1696 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1697 }; 1698 1699 static const int16_t Sub0_15_64[] = { 1700 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1701 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1702 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1703 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 1704 }; 1705 1706 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 1707 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 1708 const int16_t *SubIndices = Sub0_15; 1709 int NElts = DstSize / 32; 1710 1711 // 64-bit select is only avaialble for SALU. 
1712 if (Pred == SCC_TRUE) { 1713 SelOp = AMDGPU::S_CSELECT_B64; 1714 EltRC = &AMDGPU::SGPR_64RegClass; 1715 SubIndices = Sub0_15_64; 1716 1717 assert(NElts % 2 == 0); 1718 NElts /= 2; 1719 } 1720 1721 MachineInstrBuilder MIB = BuildMI( 1722 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 1723 1724 I = MIB->getIterator(); 1725 1726 SmallVector<unsigned, 8> Regs; 1727 for (int Idx = 0; Idx != NElts; ++Idx) { 1728 unsigned DstElt = MRI.createVirtualRegister(EltRC); 1729 Regs.push_back(DstElt); 1730 1731 unsigned SubIdx = SubIndices[Idx]; 1732 1733 MachineInstr *Select = 1734 BuildMI(MBB, I, DL, get(SelOp), DstElt) 1735 .addReg(FalseReg, 0, SubIdx) 1736 .addReg(TrueReg, 0, SubIdx); 1737 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 1738 1739 MIB.addReg(DstElt) 1740 .addImm(SubIdx); 1741 } 1742 } 1743 1744 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 1745 switch (MI.getOpcode()) { 1746 case AMDGPU::V_MOV_B32_e32: 1747 case AMDGPU::V_MOV_B32_e64: 1748 case AMDGPU::V_MOV_B64_PSEUDO: { 1749 // If there are additional implicit register operands, this may be used for 1750 // register indexing so the source register operand isn't simply copied. 
1751 unsigned NumOps = MI.getDesc().getNumOperands() + 1752 MI.getDesc().getNumImplicitUses(); 1753 1754 return MI.getNumOperands() == NumOps; 1755 } 1756 case AMDGPU::S_MOV_B32: 1757 case AMDGPU::S_MOV_B64: 1758 case AMDGPU::COPY: 1759 return true; 1760 default: 1761 return false; 1762 } 1763 } 1764 1765 static void removeModOperands(MachineInstr &MI) { 1766 unsigned Opc = MI.getOpcode(); 1767 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1768 AMDGPU::OpName::src0_modifiers); 1769 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1770 AMDGPU::OpName::src1_modifiers); 1771 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1772 AMDGPU::OpName::src2_modifiers); 1773 1774 MI.RemoveOperand(Src2ModIdx); 1775 MI.RemoveOperand(Src1ModIdx); 1776 MI.RemoveOperand(Src0ModIdx); 1777 } 1778 1779 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 1780 unsigned Reg, MachineRegisterInfo *MRI) const { 1781 if (!MRI->hasOneNonDBGUse(Reg)) 1782 return false; 1783 1784 unsigned Opc = UseMI.getOpcode(); 1785 if (Opc == AMDGPU::COPY) { 1786 bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); 1787 switch (DefMI.getOpcode()) { 1788 default: 1789 return false; 1790 case AMDGPU::S_MOV_B64: 1791 // TODO: We could fold 64-bit immediates, but this get compilicated 1792 // when there are sub-registers. 1793 return false; 1794 1795 case AMDGPU::V_MOV_B32_e32: 1796 case AMDGPU::S_MOV_B32: 1797 break; 1798 } 1799 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 1800 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 1801 assert(ImmOp); 1802 // FIXME: We could handle FrameIndex values here. 
1803 if (!ImmOp->isImm()) { 1804 return false; 1805 } 1806 UseMI.setDesc(get(NewOpc)); 1807 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 1808 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 1809 return true; 1810 } 1811 1812 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 1813 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { 1814 // Don't fold if we are using source or output modifiers. The new VOP2 1815 // instructions don't have them. 1816 if (hasAnyModifiersSet(UseMI)) 1817 return false; 1818 1819 const MachineOperand &ImmOp = DefMI.getOperand(1); 1820 1821 // If this is a free constant, there's no reason to do this. 1822 // TODO: We could fold this here instead of letting SIFoldOperands do it 1823 // later. 1824 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 1825 1826 // Any src operand can be used for the legality check. 1827 if (isInlineConstant(UseMI, *Src0, ImmOp)) 1828 return false; 1829 1830 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; 1831 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 1832 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 1833 1834 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 1835 // We should only expect these to be on src0 due to canonicalizations. 1836 if (Src0->isReg() && Src0->getReg() == Reg) { 1837 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1838 return false; 1839 1840 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 1841 return false; 1842 1843 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 1844 1845 const int64_t Imm = DefMI.getOperand(1).getImm(); 1846 1847 // FIXME: This would be a lot easier if we could return a new instruction 1848 // instead of having to modify in place. 1849 1850 // Remove these first since they are at the end. 
1851 UseMI.RemoveOperand( 1852 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1853 UseMI.RemoveOperand( 1854 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1855 1856 unsigned Src1Reg = Src1->getReg(); 1857 unsigned Src1SubReg = Src1->getSubReg(); 1858 Src0->setReg(Src1Reg); 1859 Src0->setSubReg(Src1SubReg); 1860 Src0->setIsKill(Src1->isKill()); 1861 1862 if (Opc == AMDGPU::V_MAC_F32_e64 || 1863 Opc == AMDGPU::V_MAC_F16_e64) 1864 UseMI.untieRegOperand( 1865 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1866 1867 Src1->ChangeToImmediate(Imm); 1868 1869 removeModOperands(UseMI); 1870 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); 1871 1872 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1873 if (DeleteDef) 1874 DefMI.eraseFromParent(); 1875 1876 return true; 1877 } 1878 1879 // Added part is the constant: Use v_madak_{f16, f32}. 1880 if (Src2->isReg() && Src2->getReg() == Reg) { 1881 // Not allowed to use constant bus for another operand. 1882 // We can however allow an inline immediate as src0. 1883 if (!Src0->isImm() && 1884 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1885 return false; 1886 1887 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1888 return false; 1889 1890 const int64_t Imm = DefMI.getOperand(1).getImm(); 1891 1892 // FIXME: This would be a lot easier if we could return a new instruction 1893 // instead of having to modify in place. 1894 1895 // Remove these first since they are at the end. 1896 UseMI.RemoveOperand( 1897 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 1898 UseMI.RemoveOperand( 1899 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 1900 1901 if (Opc == AMDGPU::V_MAC_F32_e64 || 1902 Opc == AMDGPU::V_MAC_F16_e64) 1903 UseMI.untieRegOperand( 1904 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1905 1906 // ChangingToImmediate adds Src2 back to the instruction. 
1907 Src2->ChangeToImmediate(Imm); 1908 1909 // These come before src2. 1910 removeModOperands(UseMI); 1911 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); 1912 1913 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1914 if (DeleteDef) 1915 DefMI.eraseFromParent(); 1916 1917 return true; 1918 } 1919 } 1920 1921 return false; 1922 } 1923 1924 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1925 int WidthB, int OffsetB) { 1926 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1927 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1928 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 1929 return LowOffset + LowWidth <= HighOffset; 1930 } 1931 1932 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, 1933 MachineInstr &MIb) const { 1934 unsigned BaseReg0, BaseReg1; 1935 int64_t Offset0, Offset1; 1936 1937 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && 1938 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { 1939 1940 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 1941 // FIXME: Handle ds_read2 / ds_write2. 1942 return false; 1943 } 1944 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 1945 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 1946 if (BaseReg0 == BaseReg1 && 1947 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1948 return true; 1949 } 1950 } 1951 1952 return false; 1953 } 1954 1955 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 1956 MachineInstr &MIb, 1957 AliasAnalysis *AA) const { 1958 assert((MIa.mayLoad() || MIa.mayStore()) && 1959 "MIa must load from or modify a memory location"); 1960 assert((MIb.mayLoad() || MIb.mayStore()) && 1961 "MIb must load from or modify a memory location"); 1962 1963 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 1964 return false; 1965 1966 // XXX - Can we relax this between address spaces? 
1967 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1968 return false; 1969 1970 if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { 1971 const MachineMemOperand *MMOa = *MIa.memoperands_begin(); 1972 const MachineMemOperand *MMOb = *MIb.memoperands_begin(); 1973 if (MMOa->getValue() && MMOb->getValue()) { 1974 MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); 1975 MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); 1976 if (!AA->alias(LocA, LocB)) 1977 return true; 1978 } 1979 } 1980 1981 // TODO: Should we check the address space from the MachineMemOperand? That 1982 // would allow us to distinguish objects we know don't alias based on the 1983 // underlying address space, even if it was lowered to a different one, 1984 // e.g. private accesses lowered to use MUBUF instructions on a scratch 1985 // buffer. 1986 if (isDS(MIa)) { 1987 if (isDS(MIb)) 1988 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1989 1990 return !isFLAT(MIb); 1991 } 1992 1993 if (isMUBUF(MIa) || isMTBUF(MIa)) { 1994 if (isMUBUF(MIb) || isMTBUF(MIb)) 1995 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1996 1997 return !isFLAT(MIb) && !isSMRD(MIb); 1998 } 1999 2000 if (isSMRD(MIa)) { 2001 if (isSMRD(MIb)) 2002 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2003 2004 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 2005 } 2006 2007 if (isFLAT(MIa)) { 2008 if (isFLAT(MIb)) 2009 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2010 2011 return false; 2012 } 2013 2014 return false; 2015 } 2016 2017 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2018 MachineInstr &MI, 2019 LiveVariables *LV) const { 2020 bool IsF16 = false; 2021 2022 switch (MI.getOpcode()) { 2023 default: 2024 return nullptr; 2025 case AMDGPU::V_MAC_F16_e64: 2026 IsF16 = true; 2027 case AMDGPU::V_MAC_F32_e64: 2028 break; 2029 case AMDGPU::V_MAC_F16_e32: 2030 IsF16 = true; 2031 case AMDGPU::V_MAC_F32_e32: { 2032 int Src0Idx = 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2033 AMDGPU::OpName::src0); 2034 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2035 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2036 return nullptr; 2037 break; 2038 } 2039 } 2040 2041 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2042 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2043 const MachineOperand *Src0Mods = 2044 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2045 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2046 const MachineOperand *Src1Mods = 2047 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2048 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2049 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2050 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2051 2052 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2053 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) 2054 .add(*Dst) 2055 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2056 .add(*Src0) 2057 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2058 .add(*Src1) 2059 .addImm(0) // Src mods 2060 .add(*Src2) 2061 .addImm(Clamp ? Clamp->getImm() : 0) 2062 .addImm(Omod ? Omod->getImm() : 0); 2063 } 2064 2065 // It's not generally safe to move VALU instructions across these since it will 2066 // start using the register as a base index rather than directly. 2067 // XXX - Why isn't hasSideEffects sufficient for these? 
// Returns true for the S_SET_GPR_IDX_* opcodes that switch VGPR indexing
// mode; VALU instructions must not be moved across these.
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // XXX - Do we want the SP check in the base implementation?

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
         changesVGPRIndexingMode(MI);
}

// Returns true if \p Imm (16/32/64 bits wide) can be encoded as a hardware
// inline constant on this subtarget.
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  switch (Imm.getBitWidth()) {
  case 32:
    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 64:
    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 16:
    // 16-bit inline constants only exist on subtargets with 16-bit insts.
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

// Returns true if immediate operand \p MO is an inline constant for an
// operand slot of the given target \p OperandType.
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   uint8_t OperandType) const {
  if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  int64_t Imm = MO.getImm();
  switch (OperandType) {
  case AMDGPU::OPERAND_REG_IMM_INT32:
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
    int32_t Trunc = static_cast<int32_t>(Imm);
    // Reject values that don't fit in 32 bits at all.
    return Trunc == Imm &&
           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  case AMDGPU::OPERAND_REG_IMM_INT64:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                        ST.hasInv2PiInlineImm());
  }
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
    uint32_t Trunc = static_cast<uint32_t>(Imm);
    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
  }
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

// Returns true if \p MO will be encoded as a literal constant (i.e. anything
// that is not a register and not an inline constant).
bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
                                        const MCOperandInfo &OpInfo) const {
  switch (MO.getType()) {
  case MachineOperand::MO_Register:
    return false;
  case MachineOperand::MO_Immediate:
    return !isInlineConstant(MO, OpInfo);
  case MachineOperand::MO_FrameIndex:
  case MachineOperand::MO_MachineBasicBlock:
  case MachineOperand::MO_ExternalSymbol:
  case MachineOperand::MO_GlobalAddress:
  case MachineOperand::MO_MCSymbol:
    // These all lower to some literal-like encoding eventually.
    return true;
  default:
    llvm_unreachable("unexpected operand type");
  }
}

// Shallow equality on register/immediate operands only.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

// Returns true if immediate \p MO may legally be encoded in operand slot
// \p OpNo of \p MI (either inline or as a literal).
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  if (MO.isImm() && isInlineConstant(MO, OpInfo))
    return RI.opCanUseInlineConstant(OpInfo.OperandType);

  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
}

// Returns true if \p Opcode has a VOP*_e32 (32-bit encoded) equivalent that
// maps to a real MC opcode.
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

// Returns true if the named modifier operand exists and is non-zero.
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         hasModifiersSet(MI, AMDGPU::OpName::omod);
}

// Returns true if reading operand \p MO consumes the single scalar
// (constant) bus read a VALU instruction is allowed per cycle.
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  const MCOperandInfo &OpInfo) const {
  // Literal constants use the constant bus.
  //if (isLiteralConstantLike(MO, OpInfo))
  //  return true;
  if (MO.isImm())
    return !isInlineConstant(MO, OpInfo);

  if (!MO.isReg())
    return true; // Misc other operands like FrameIndex

  if (!MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

// Returns the special SGPR (VCC, M0 or FLAT_SCR) that \p MI implicitly
// reads, or NoRegister if there is none.
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

// Returns true if \p MI is expected to carry an implicit use of EXEC
// (used by the verifier below). Lane-access instructions are exempt.
static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READLANE_B32_si:
    case AMDGPU::V_READLANE_B32_vi:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::V_WRITELANE_B32_si:
    case AMDGPU::V_WRITELANE_B32_vi:
      return false;
    }

    return true;
  }

  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}

// Returns true if \p SubReg reads a sub-register of \p SuperVec (physical
// sub-register, or same virtual register with a subreg index).
static bool isSubRegOf(const SIRegisterInfo &TRI,
                       const MachineOperand &SuperVec,
                       const MachineOperand &SubReg) {
  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}

// Target-specific machine verifier hook; sets ErrInfo and returns false on
// the first violated constraint.
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  if (MI.isInlineAsm()) {
    // Verify register classes for inlineasm constraints.
    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
         I != E; ++I) {
      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
      if (!RC)
        continue;

      const MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg())
        continue;

      unsigned Reg = Op.getReg();
      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
        ErrInfo = "inlineasm operand has incorrect register class.";
        return false;
      }
    }

    // No further checks apply to inline asm.
    return true;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI.getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI.getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM_INT32:
    case AMDGPU::OPERAND_REG_IMM_FP32:
      // These accept a register or any immediate (literals allowed).
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
      // These accept a register or an inline constant only.
      const MachineOperand &MO = MI.getOperand(i);
      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    }
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      LLVM_FALLTHROUGH;
    default:
      continue;
    }

    if (!MI.getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI.getOperand(i).getReg();
      if (Reg == AMDGPU::NoRegister ||
          TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;

    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
      ++ConstantBusCount;

    unsigned SGPRUsed = findImplicitSGPRRead(MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI.getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
        if (MO.isReg()) {
          // Reading the same SGPR twice only counts as one bus use.
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  if (isSOPK(MI)) {
    // simm16 must fit the (zero- or sign-extended) 16-bit encoding.
    int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
    if (sopkIsZext(MI)) {
      if (!isUInt<16>(Imm)) {
        ErrInfo = "invalid immediate for SOPK instruction";
        return false;
      }
    } else {
      if (!isInt<16>(Imm)) {
        ErrInfo = "invalid immediate for SOPK instruction";
        return false;
      }
    }
  }

  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

    const unsigned StaticNumOps = Desc.getNumOperands() +
      Desc.getNumImplicitUses();
    const unsigned NumImplicitOps = IsDst ? 2 : 1;

    // Allow additional implicit operands. This allows a fixup done by the post
    // RA scheduler where the main implicit operand is killed and implicit-defs
    // are added for sub-registers that remain live after this instruction.
    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
      ErrInfo = "missing implicit register operands";
      return false;
    }

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (IsDst) {
      if (!Dst->isUse()) {
        ErrInfo = "v_movreld_b32 vdst should be a use operand";
        return false;
      }

      unsigned UseOpIdx;
      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
          UseOpIdx != StaticNumOps + 1) {
        ErrInfo = "movrel implicit operands should be tied";
        return false;
      }
    }

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &ImpUse
      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    if (!ImpUse.isReg() || !ImpUse.isUse() ||
        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
      ErrInfo = "src0 should be subreg of implicit vector use";
      return false;
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (shouldReadExec(MI)) {
    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  if (isSMRD(MI)) {
    if (MI.mayStore()) {
      // The register offset form of scalar stores may only use m0 as the
      // soffset register.
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
      if (Soff && Soff->getReg() != AMDGPU::M0) {
        ErrInfo = "scalar stores must use m0 as offset register";
        return false;
      }
    }
  }

  return true;
}

// Maps a scalar (SALU) opcode to the VALU opcode used when the instruction
// must be moved to the vector unit; returns INSTRUCTION_LIST_END if there is
// no equivalent.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): S_NOT_B64 maps to the 32-bit V_NOT; presumably the 64-bit
  // op is split into two halves elsewhere in moveToVALU -- confirm.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

// Returns the register class of operand \p OpNo: the class from the opcode
// description when available, otherwise the class of the actual register.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    // Generic opcodes can read VGPRs only if their result class has VGPRs.
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

// Replaces operand \p OpIdx with a new virtual register defined by a
// COPY/S_MOV/V_MOV of the original operand, making the operand legal.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  MO.ChangeToRegister(Reg, false);
}

// Extracts sub-register \p SubIdx of \p SuperReg into a fresh virtual
// register of class \p SubRC and returns it.
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

// As buildExtractSubReg, but also handles a 64-bit immediate \p Op by
// returning the selected 32-bit half as an immediate operand.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  MachineOperand Op1 =
                       Inst.getOperand(1);
  // Removing operand 1 shifts operand 2 down; re-appending the saved copy
  // yields the (0, 2, 1) order.
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}

// Returns true if register operand \p MO satisfies the register-class
// constraint \p OpInfo (after applying any sub-register index).
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  unsigned Reg = MO.getReg();
  const TargetRegisterClass *RC =
    TargetRegisterInfo::isVirtualRegister(Reg) ?
    MRI.getRegClass(Reg) :
    RI.getPhysRegClass(Reg);

  const SIRegisterInfo *TRI =
    static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  RC = TRI->getSubRegClass(RC, MO.getSubReg());

  // In order to be legal, the common sub-class must be equal to the
  // class of the current operand.  For example:
  //
  // v_mov_b32 s0 ; Operand defined as vsrc_b32
  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

// As isLegalRegOperand, but also accepts immediate-like operands, which are
// always legal in a VSrc slot.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

// Returns true if operand \p MO (defaulting to MI's current operand at
// \p OpIdx) would be legal in slot \p OpIdx, including the constant-bus
// restriction for VALU instructions.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
    OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {

    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    // Any other operand already using the constant bus makes this placement
    // illegal (unless it is the same SGPR).
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          return false;
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Rewrites the operands of a VOP2 instruction so they satisfy the encoding's
// constraints (at most one constant-bus read, register classes, etc.).
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI.getOperand(Src0Idx);

    // The implicit read already consumes the constant bus, so src0 must not
    // be an SGPR.
    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
      .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Manually swap src0 and src1 (including subregs and kill flags).
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

// Copies the (uniform) value of VGPR \p SrcReg into a fresh SGPR of matching
// width using one V_READFIRSTLANE_B32 per 32-bit lane; returns the new SGPR.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
      .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  // Recombine the per-lane SGPRs into one wide SGPR value.
  MachineInstrBuilder MIB =
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
2996 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 2997 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 2998 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 2999 SBase->setReg(SGPR); 3000 } 3001 } 3002 3003 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 3004 MachineBasicBlock::iterator I, 3005 const TargetRegisterClass *DstRC, 3006 MachineOperand &Op, 3007 MachineRegisterInfo &MRI, 3008 const DebugLoc &DL) const { 3009 3010 unsigned OpReg = Op.getReg(); 3011 unsigned OpSubReg = Op.getSubReg(); 3012 3013 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 3014 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 3015 3016 // Check if operand is already the correct register class. 3017 if (DstRC == OpRC) 3018 return; 3019 3020 unsigned DstReg = MRI.createVirtualRegister(DstRC); 3021 MachineInstr *Copy = 3022 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 3023 3024 Op.setReg(DstReg); 3025 Op.setSubReg(0); 3026 3027 MachineInstr *Def = MRI.getVRegDef(OpReg); 3028 if (!Def) 3029 return; 3030 3031 // Try to eliminate the copy if it is copying an immediate value. 3032 if (Def->isMoveImmediate()) 3033 FoldImmediate(*Copy, *Def, OpReg, &MRI); 3034 } 3035 3036 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { 3037 MachineFunction &MF = *MI.getParent()->getParent(); 3038 MachineRegisterInfo &MRI = MF.getRegInfo(); 3039 3040 // Legalize VOP2 3041 if (isVOP2(MI) || isVOPC(MI)) { 3042 legalizeOperandsVOP2(MRI, MI); 3043 return; 3044 } 3045 3046 // Legalize VOP3 3047 if (isVOP3(MI)) { 3048 legalizeOperandsVOP3(MRI, MI); 3049 return; 3050 } 3051 3052 // Legalize SMRD 3053 if (isSMRD(MI)) { 3054 legalizeOperandsSMRD(MRI, MI); 3055 return; 3056 } 3057 3058 // Legalize REG_SEQUENCE and PHI 3059 // The register class of the operands much be the same type as the register 3060 // class of the output. 
3061 if (MI.getOpcode() == AMDGPU::PHI) { 3062 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; 3063 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { 3064 if (!MI.getOperand(i).isReg() || 3065 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) 3066 continue; 3067 const TargetRegisterClass *OpRC = 3068 MRI.getRegClass(MI.getOperand(i).getReg()); 3069 if (RI.hasVGPRs(OpRC)) { 3070 VRC = OpRC; 3071 } else { 3072 SRC = OpRC; 3073 } 3074 } 3075 3076 // If any of the operands are VGPR registers, then they all most be 3077 // otherwise we will create illegal VGPR->SGPR copies when legalizing 3078 // them. 3079 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { 3080 if (!VRC) { 3081 assert(SRC); 3082 VRC = RI.getEquivalentVGPRClass(SRC); 3083 } 3084 RC = VRC; 3085 } else { 3086 RC = SRC; 3087 } 3088 3089 // Update all the operands so they have the same type. 3090 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3091 MachineOperand &Op = MI.getOperand(I); 3092 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) 3093 continue; 3094 3095 // MI is a PHI instruction. 3096 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 3097 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 3098 3099 // Avoid creating no-op copies with the same src and dst reg class. These 3100 // confuse some of the machine passes. 3101 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 3102 } 3103 } 3104 3105 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 3106 // VGPR dest type and SGPR sources, insert copies so all operands are 3107 // VGPRs. This seems to help operand folding / the register coalescer. 
3108 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 3109 MachineBasicBlock *MBB = MI.getParent(); 3110 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 3111 if (RI.hasVGPRs(DstRC)) { 3112 // Update all the operands so they are VGPR register classes. These may 3113 // not be the same register class because REG_SEQUENCE supports mixing 3114 // subregister index types e.g. sub0_sub1 + sub2 + sub3 3115 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3116 MachineOperand &Op = MI.getOperand(I); 3117 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) 3118 continue; 3119 3120 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 3121 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 3122 if (VRC == OpRC) 3123 continue; 3124 3125 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 3126 Op.setIsKill(); 3127 } 3128 } 3129 3130 return; 3131 } 3132 3133 // Legalize INSERT_SUBREG 3134 // src0 must have the same register class as dst 3135 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 3136 unsigned Dst = MI.getOperand(0).getReg(); 3137 unsigned Src0 = MI.getOperand(1).getReg(); 3138 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 3139 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 3140 if (DstRC != Src0RC) { 3141 MachineBasicBlock *MBB = MI.getParent(); 3142 MachineOperand &Op = MI.getOperand(1); 3143 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 3144 } 3145 return; 3146 } 3147 3148 // Legalize MIMG and MUBUF/MTBUF for shaders. 3149 // 3150 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 3151 // scratch memory access. In both cases, the legalization never involves 3152 // conversion to the addr64 form. 
3153 if (isMIMG(MI) || 3154 (AMDGPU::isShader(MF.getFunction()->getCallingConv()) && 3155 (isMUBUF(MI) || isMTBUF(MI)))) { 3156 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 3157 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { 3158 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); 3159 SRsrc->setReg(SGPR); 3160 } 3161 3162 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 3163 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { 3164 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); 3165 SSamp->setReg(SGPR); 3166 } 3167 return; 3168 } 3169 3170 // Legalize MUBUF* instructions by converting to addr64 form. 3171 // FIXME: If we start using the non-addr64 instructions for compute, we 3172 // may need to legalize them as above. This especially applies to the 3173 // buffer_load_format_* variants and variants with idxen (or bothen). 3174 int SRsrcIdx = 3175 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 3176 if (SRsrcIdx != -1) { 3177 // We have an MUBUF instruction 3178 MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); 3179 unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; 3180 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), 3181 RI.getRegClass(SRsrcRC))) { 3182 // The operands are legal. 3183 // FIXME: We may need to legalize operands besided srsrc. 3184 return; 3185 } 3186 3187 MachineBasicBlock &MBB = *MI.getParent(); 3188 3189 // Extract the ptr from the resource descriptor. 
3190 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, 3191 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 3192 3193 // Create an empty resource descriptor 3194 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3195 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3196 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3197 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); 3198 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); 3199 3200 // Zero64 = 0 3201 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) 3202 .addImm(0); 3203 3204 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 3205 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 3206 .addImm(RsrcDataFormat & 0xFFFFFFFF); 3207 3208 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 3209 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 3210 .addImm(RsrcDataFormat >> 32); 3211 3212 // NewSRsrc = {Zero64, SRsrcFormat} 3213 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) 3214 .addReg(Zero64) 3215 .addImm(AMDGPU::sub0_sub1) 3216 .addReg(SRsrcFormatLo) 3217 .addImm(AMDGPU::sub2) 3218 .addReg(SRsrcFormatHi) 3219 .addImm(AMDGPU::sub3); 3220 3221 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 3222 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 3223 if (VAddr) { 3224 // This is already an ADDR64 instruction so we need to add the pointer 3225 // extracted from the resource descriptor to the current value of VAddr. 
3226 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3227 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3228 3229 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 3230 DebugLoc DL = MI.getDebugLoc(); 3231 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) 3232 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 3233 .addReg(VAddr->getReg(), 0, AMDGPU::sub0); 3234 3235 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 3236 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) 3237 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 3238 .addReg(VAddr->getReg(), 0, AMDGPU::sub1); 3239 3240 // NewVaddr = {NewVaddrHi, NewVaddrLo} 3241 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 3242 .addReg(NewVAddrLo) 3243 .addImm(AMDGPU::sub0) 3244 .addReg(NewVAddrHi) 3245 .addImm(AMDGPU::sub1); 3246 } else { 3247 // This instructions is the _OFFSET variant, so we need to convert it to 3248 // ADDR64. 3249 assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() 3250 < SISubtarget::VOLCANIC_ISLANDS && 3251 "FIXME: Need to emit flat atomics here"); 3252 3253 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); 3254 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3255 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 3256 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); 3257 3258 // Atomics rith return have have an additional tied operand and are 3259 // missing some of the special bits. 3260 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); 3261 MachineInstr *Addr64; 3262 3263 if (!VDataIn) { 3264 // Regular buffer load / store. 3265 MachineInstrBuilder MIB = 3266 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 3267 .add(*VData) 3268 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 3269 // This will be replaced later 3270 // with the new value of vaddr. 
3271 .add(*SRsrc) 3272 .add(*SOffset) 3273 .add(*Offset); 3274 3275 // Atomics do not have this operand. 3276 if (const MachineOperand *GLC = 3277 getNamedOperand(MI, AMDGPU::OpName::glc)) { 3278 MIB.addImm(GLC->getImm()); 3279 } 3280 3281 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); 3282 3283 if (const MachineOperand *TFE = 3284 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 3285 MIB.addImm(TFE->getImm()); 3286 } 3287 3288 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 3289 Addr64 = MIB; 3290 } else { 3291 // Atomics with return. 3292 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 3293 .add(*VData) 3294 .add(*VDataIn) 3295 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 3296 // This will be replaced later 3297 // with the new value of vaddr. 3298 .add(*SRsrc) 3299 .add(*SOffset) 3300 .add(*Offset) 3301 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 3302 .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 3303 } 3304 3305 MI.removeFromParent(); 3306 3307 // NewVaddr = {NewVaddrHi, NewVaddrLo} 3308 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 3309 NewVAddr) 3310 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 3311 .addImm(AMDGPU::sub0) 3312 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 3313 .addImm(AMDGPU::sub1); 3314 3315 VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); 3316 SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); 3317 } 3318 3319 // Update the instruction to use NewVaddr 3320 VAddr->setReg(NewVAddr); 3321 // Update the instruction to use NewSRsrc 3322 SRsrc->setReg(NewSRsrc); 3323 } 3324 } 3325 3326 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 3327 SmallVector<MachineInstr *, 128> Worklist; 3328 Worklist.push_back(&TopInst); 3329 3330 while (!Worklist.empty()) { 3331 MachineInstr &Inst = *Worklist.pop_back_val(); 3332 MachineBasicBlock *MBB = Inst.getParent(); 3333 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 3334 3335 unsigned Opcode = 
Inst.getOpcode(); 3336 unsigned NewOpcode = getVALUOp(Inst); 3337 3338 // Handle some special cases 3339 switch (Opcode) { 3340 default: 3341 break; 3342 case AMDGPU::S_AND_B64: 3343 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 3344 Inst.eraseFromParent(); 3345 continue; 3346 3347 case AMDGPU::S_OR_B64: 3348 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 3349 Inst.eraseFromParent(); 3350 continue; 3351 3352 case AMDGPU::S_XOR_B64: 3353 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 3354 Inst.eraseFromParent(); 3355 continue; 3356 3357 case AMDGPU::S_NOT_B64: 3358 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 3359 Inst.eraseFromParent(); 3360 continue; 3361 3362 case AMDGPU::S_BCNT1_I32_B64: 3363 splitScalar64BitBCNT(Worklist, Inst); 3364 Inst.eraseFromParent(); 3365 continue; 3366 3367 case AMDGPU::S_BFE_I64: { 3368 splitScalar64BitBFE(Worklist, Inst); 3369 Inst.eraseFromParent(); 3370 continue; 3371 } 3372 3373 case AMDGPU::S_LSHL_B32: 3374 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3375 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 3376 swapOperands(Inst); 3377 } 3378 break; 3379 case AMDGPU::S_ASHR_I32: 3380 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3381 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 3382 swapOperands(Inst); 3383 } 3384 break; 3385 case AMDGPU::S_LSHR_B32: 3386 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3387 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 3388 swapOperands(Inst); 3389 } 3390 break; 3391 case AMDGPU::S_LSHL_B64: 3392 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3393 NewOpcode = AMDGPU::V_LSHLREV_B64; 3394 swapOperands(Inst); 3395 } 3396 break; 3397 case AMDGPU::S_ASHR_I64: 3398 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3399 NewOpcode = AMDGPU::V_ASHRREV_I64; 3400 swapOperands(Inst); 3401 } 3402 break; 3403 case AMDGPU::S_LSHR_B64: 3404 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3405 
NewOpcode = AMDGPU::V_LSHRREV_B64; 3406 swapOperands(Inst); 3407 } 3408 break; 3409 3410 case AMDGPU::S_ABS_I32: 3411 lowerScalarAbs(Worklist, Inst); 3412 Inst.eraseFromParent(); 3413 continue; 3414 3415 case AMDGPU::S_CBRANCH_SCC0: 3416 case AMDGPU::S_CBRANCH_SCC1: 3417 // Clear unused bits of vcc 3418 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 3419 AMDGPU::VCC) 3420 .addReg(AMDGPU::EXEC) 3421 .addReg(AMDGPU::VCC); 3422 break; 3423 3424 case AMDGPU::S_BFE_U64: 3425 case AMDGPU::S_BFM_B64: 3426 llvm_unreachable("Moving this op to VALU not implemented"); 3427 3428 case AMDGPU::S_PACK_LL_B32_B16: 3429 case AMDGPU::S_PACK_LH_B32_B16: 3430 case AMDGPU::S_PACK_HH_B32_B16: { 3431 movePackToVALU(Worklist, MRI, Inst); 3432 Inst.eraseFromParent(); 3433 continue; 3434 } 3435 } 3436 3437 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 3438 // We cannot move this instruction to the VALU, so we should try to 3439 // legalize its operands instead. 3440 legalizeOperands(Inst); 3441 continue; 3442 } 3443 3444 // Use the new VALU Opcode. 3445 const MCInstrDesc &NewDesc = get(NewOpcode); 3446 Inst.setDesc(NewDesc); 3447 3448 // Remove any references to SCC. Vector instructions can't read from it, and 3449 // We're just about to add the implicit use / defs of VCC, and we don't want 3450 // both. 3451 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 3452 MachineOperand &Op = Inst.getOperand(i); 3453 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 3454 Inst.RemoveOperand(i); 3455 addSCCDefUsersToVALUWorklist(Inst, Worklist); 3456 } 3457 } 3458 3459 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 3460 // We are converting these to a BFE, so we need to add the missing 3461 // operands for the size and offset. 3462 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; 3463 Inst.addOperand(MachineOperand::CreateImm(0)); 3464 Inst.addOperand(MachineOperand::CreateImm(Size)); 3465 3466 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 3467 // The VALU version adds the second operand to the result, so insert an 3468 // extra 0 operand. 3469 Inst.addOperand(MachineOperand::CreateImm(0)); 3470 } 3471 3472 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 3473 3474 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 3475 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 3476 // If we need to move this to VGPRs, we need to unpack the second operand 3477 // back into the 2 separate ones for bit offset and width. 3478 assert(OffsetWidthOp.isImm() && 3479 "Scalar BFE is only implemented for constant width and offset"); 3480 uint32_t Imm = OffsetWidthOp.getImm(); 3481 3482 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 3483 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 3484 Inst.RemoveOperand(2); // Remove old immediate. 3485 Inst.addOperand(MachineOperand::CreateImm(Offset)); 3486 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 3487 } 3488 3489 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 3490 unsigned NewDstReg = AMDGPU::NoRegister; 3491 if (HasDst) { 3492 unsigned DstReg = Inst.getOperand(0).getReg(); 3493 if (TargetRegisterInfo::isPhysicalRegister(DstReg)) 3494 continue; 3495 3496 // Update the destination register class. 3497 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 3498 if (!NewDstRC) 3499 continue; 3500 3501 if (Inst.isCopy() && 3502 TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && 3503 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 3504 // Instead of creating a copy where src and dst are the same register 3505 // class, we just replace all uses of dst with src. 
These kinds of 3506 // copies interfere with the heuristics MachineSink uses to decide 3507 // whether or not to split a critical edge. Since the pass assumes 3508 // that copies will end up as machine instructions and not be 3509 // eliminated. 3510 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 3511 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 3512 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 3513 Inst.getOperand(0).setReg(DstReg); 3514 continue; 3515 } 3516 3517 NewDstReg = MRI.createVirtualRegister(NewDstRC); 3518 MRI.replaceRegWith(DstReg, NewDstReg); 3519 } 3520 3521 // Legalize the operands 3522 legalizeOperands(Inst); 3523 3524 if (HasDst) 3525 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 3526 } 3527 } 3528 3529 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 3530 MachineInstr &Inst) const { 3531 MachineBasicBlock &MBB = *Inst.getParent(); 3532 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3533 MachineBasicBlock::iterator MII = Inst; 3534 DebugLoc DL = Inst.getDebugLoc(); 3535 3536 MachineOperand &Dest = Inst.getOperand(0); 3537 MachineOperand &Src = Inst.getOperand(1); 3538 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3539 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3540 3541 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 3542 .addImm(0) 3543 .addReg(Src.getReg()); 3544 3545 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 3546 .addReg(Src.getReg()) 3547 .addReg(TmpReg); 3548 3549 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3550 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3551 } 3552 3553 void SIInstrInfo::splitScalar64BitUnaryOp( 3554 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3555 unsigned Opcode) const { 3556 MachineBasicBlock &MBB = *Inst.getParent(); 3557 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3558 3559 MachineOperand &Dest = Inst.getOperand(0); 3560 
MachineOperand &Src0 = Inst.getOperand(1); 3561 DebugLoc DL = Inst.getDebugLoc(); 3562 3563 MachineBasicBlock::iterator MII = Inst; 3564 3565 const MCInstrDesc &InstDesc = get(Opcode); 3566 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3567 MRI.getRegClass(Src0.getReg()) : 3568 &AMDGPU::SGPR_32RegClass; 3569 3570 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3571 3572 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3573 AMDGPU::sub0, Src0SubRC); 3574 3575 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3576 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3577 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3578 3579 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3580 BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 3581 3582 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3583 AMDGPU::sub1, Src0SubRC); 3584 3585 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3586 BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 3587 3588 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3589 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3590 .addReg(DestSub0) 3591 .addImm(AMDGPU::sub0) 3592 .addReg(DestSub1) 3593 .addImm(AMDGPU::sub1); 3594 3595 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3596 3597 // We don't need to legalizeOperands here because for a single operand, src0 3598 // will support any kind of input. 3599 3600 // Move all users of this moved value. 
3601 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3602 } 3603 3604 void SIInstrInfo::splitScalar64BitBinaryOp( 3605 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 3606 unsigned Opcode) const { 3607 MachineBasicBlock &MBB = *Inst.getParent(); 3608 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3609 3610 MachineOperand &Dest = Inst.getOperand(0); 3611 MachineOperand &Src0 = Inst.getOperand(1); 3612 MachineOperand &Src1 = Inst.getOperand(2); 3613 DebugLoc DL = Inst.getDebugLoc(); 3614 3615 MachineBasicBlock::iterator MII = Inst; 3616 3617 const MCInstrDesc &InstDesc = get(Opcode); 3618 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3619 MRI.getRegClass(Src0.getReg()) : 3620 &AMDGPU::SGPR_32RegClass; 3621 3622 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3623 const TargetRegisterClass *Src1RC = Src1.isReg() ? 3624 MRI.getRegClass(Src1.getReg()) : 3625 &AMDGPU::SGPR_32RegClass; 3626 3627 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 3628 3629 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3630 AMDGPU::sub0, Src0SubRC); 3631 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3632 AMDGPU::sub0, Src1SubRC); 3633 3634 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3635 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3636 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3637 3638 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3639 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 3640 .add(SrcReg0Sub0) 3641 .add(SrcReg1Sub0); 3642 3643 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3644 AMDGPU::sub1, Src0SubRC); 3645 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 3646 AMDGPU::sub1, Src1SubRC); 3647 3648 unsigned DestSub1 = 
MRI.createVirtualRegister(NewDestSubRC); 3649 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 3650 .add(SrcReg0Sub1) 3651 .add(SrcReg1Sub1); 3652 3653 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3654 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3655 .addReg(DestSub0) 3656 .addImm(AMDGPU::sub0) 3657 .addReg(DestSub1) 3658 .addImm(AMDGPU::sub1); 3659 3660 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3661 3662 // Try to legalize the operands in case we need to swap the order to keep it 3663 // valid. 3664 legalizeOperands(LoHalf); 3665 legalizeOperands(HiHalf); 3666 3667 // Move all users of this moved vlaue. 3668 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 3669 } 3670 3671 void SIInstrInfo::splitScalar64BitBCNT( 3672 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { 3673 MachineBasicBlock &MBB = *Inst.getParent(); 3674 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3675 3676 MachineBasicBlock::iterator MII = Inst; 3677 DebugLoc DL = Inst.getDebugLoc(); 3678 3679 MachineOperand &Dest = Inst.getOperand(0); 3680 MachineOperand &Src = Inst.getOperand(1); 3681 3682 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 3683 const TargetRegisterClass *SrcRC = Src.isReg() ? 
3684 MRI.getRegClass(Src.getReg()) : 3685 &AMDGPU::SGPR_32RegClass; 3686 3687 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3688 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3689 3690 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 3691 3692 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3693 AMDGPU::sub0, SrcSubRC); 3694 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 3695 AMDGPU::sub1, SrcSubRC); 3696 3697 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 3698 3699 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 3700 3701 MRI.replaceRegWith(Dest.getReg(), ResultReg); 3702 3703 // We don't need to legalize operands here. src0 for etiher instruction can be 3704 // an SGPR, and the second input is unused or determined here. 3705 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 3706 } 3707 3708 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, 3709 MachineInstr &Inst) const { 3710 MachineBasicBlock &MBB = *Inst.getParent(); 3711 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3712 MachineBasicBlock::iterator MII = Inst; 3713 DebugLoc DL = Inst.getDebugLoc(); 3714 3715 MachineOperand &Dest = Inst.getOperand(0); 3716 uint32_t Imm = Inst.getOperand(2).getImm(); 3717 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 3718 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 3719 3720 (void) Offset; 3721 3722 // Only sext_inreg cases handled. 
// NOTE(review): this chunk begins inside a function whose header lies above
// the visible region. From the opcode check it appears to finish the VALU
// expansion of a scalar S_BFE_I64 (64-bit signed bit-field extract) -- confirm
// against the full file.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    // Narrow extract: sign-extend the low 32 bits with V_BFE_I32, then fill
    // the high half with copies of the sign bit (arithmetic shift by 31).
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    // Reassemble the two 32-bit halves into the 64-bit result.
    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32 (and Offset == 0, per the assert above): the low half is
  // the source's low 32 bits unchanged; the high half is the replicated sign
  // bit of that low half.
  MachineOperand &Src = Inst.getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Add to \p Worklist every user of \p DstReg whose using operand
/// cannot read a VGPR (per canReadVGPR), so those users can later be moved to
/// the VALU themselves.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.push_back(&UseMI);

      // Skip the remaining uses belonging to this same instruction; it only
      // needs to be pushed once.
      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

/// \brief Expand a scalar s_pack_* (pack two 16-bit halves into a 32-bit
/// register) into an equivalent VALU instruction sequence, then queue any
/// users of the result that cannot read a VGPR.
void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    // result = (src1 << 16) | (src0 & 0xffff)
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // result = (src1 & 0xffff0000) | (src0 & 0xffff), via bit-field insert.
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // result = (src1 & 0xffff0000) | (src0 >> 16)
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Queue every SCC user reached by \p SCCDefInst onto \p Worklist,
/// scanning forward from the def until the next SCC def (which starts a new
/// live range) or the end of the block.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI :
       llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
                        SCCDefInst.getParent()->end())) {
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.push_back(&MI);
  }
}

/// \brief Return the VGPR register class \p Inst's result should have once the
/// instruction is moved to the VALU, or null if the destination is already a
/// VGPR class (or has no VGPR equivalent) and nothing needs to change.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
/// \brief Pick the single SGPR operand (among the up-to-three source operand
/// indices in \p OpIndices, terminated by -1) that should stay an SGPR when
/// the instruction is legalized, or AMDGPU::NoRegister if none is required.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// \brief Return a pointer to the named operand of \p MI, or null if this
/// opcode has no operand with that name.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// \brief Return the default buffer-resource DATA_FORMAT dword, with the
/// HSA-specific ATC/MTYPE bits set for generations that have them.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// \brief Return words 2-3 of the scratch buffer resource descriptor:
/// size, index stride, TID_ENABLE and (pre-GFX9) element size.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64.
  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

/// \brief True if \p MI is treated as low-latency for scheduling purposes
/// (here: scalar memory reads).
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

/// \brief True if \p MI is treated as high-latency for scheduling purposes
/// (vector memory: MUBUF, MTBUF, MIMG).
bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

/// \brief If \p MI addresses a frame index via its vaddr operand, set
/// \p FrameIndex and return the vdata register; otherwise return NoRegister.
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

/// \brief Like isStackAccess, but for SGPR spill pseudos, which use the
/// addr/data operand names and are required to address a frame index.
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

/// \brief TargetInstrInfo hook: if \p MI is a load from a stack slot, set
/// \p FrameIndex and return the destination register, else NoRegister.
unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {

  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// \brief TargetInstrInfo hook: if \p MI is a store to a stack slot, set
/// \p FrameIndex and return the source register, else NoRegister.
unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// \brief Return the encoded size of \p MI in bytes, accounting for 32-bit
/// literal operands that extend a base 4-byte encoding to 8 bytes.
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  //
  // FIXME: Instructions that have a base 32-bit encoding report their size as
  // 4, even though they are really 8 bytes if they have a literal operand.
  if (DescSize != 0 && DescSize != 4)
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    if (isFixedSize(MI))
      return DescSize;

    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return 4; // No operands.

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return 8;

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return 4;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return 8;

    return 4;
  }

  if (DescSize == 4)
    return 4;

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::EH_LABEL:
    // Meta-instructions emit no machine code.
    return 0;
  case TargetOpcode::INLINEASM: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
  }
  default:
    llvm_unreachable("unable to find instruction size");
  }
}

/// \brief Conservatively decide whether \p MI may touch the flat address
/// space: any FLAT instruction without memory operands is assumed to.
bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }
  return false;
}

/// \brief True if \p Branch is the pseudo used for divergent conditional
/// branches before structurization.
bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
}

/// \brief Rewrite a non-uniform branch terminating \p IfEntry into the
/// structured SI_IF / SI_END_CF pair, with SI_END_CF placed at the head of
/// \p IfEnd.
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstr *SIIF =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
            .add(Branch->getOperand(0))
            .add(Branch->getOperand(1));
    MachineInstr *SIEND =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
            .addReg(DstReg);

    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}

/// \brief Rewrite a non-uniform loop back-edge branch in \p LoopEnd into the
/// structured SI_IF_BREAK / SI_LOOP form, inserting the required exec-mask
/// PHI at the head of \p LoopEntry.
void SIInstrInfo::convertNonUniformLoopRegion(
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    // PHI in the loop header: BackEdgeReg when coming from LoopEnd,
    // zero when entering from any other predecessor.
    MachineInstrBuilder HeaderPHIBuilder =
        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        MachineBasicBlock *PMBB = *PI;
        unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                                  .addReg(DstReg)
                                  .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
            .addReg(BackEdgeReg)
            .addMBB(LoopEntry);

    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}

/// \brief Names for the target indices used in MIR serialization.
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                               const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

/// \brief Treat non-terminator, non-copy instructions that modify EXEC as
/// part of the block prologue, so spills/splits are inserted after them.
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

/// \brief Build a V_ADD_I32_e64 into \p DestReg whose carry-out goes to a
/// fresh dead register; the caller appends the two source operands.
MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           unsigned DestReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
         .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}