//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the check on the exec read.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
                                        int64_t &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt.getOpcode();

  if (isDS(LdSt)) {
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg =
          getNamedOperand(LdSt, AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg =
          getNamedOperand(LdSt, AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg =
        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(LdSt)) {
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg =
        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isFLAT(LdSt)) {
    const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    BaseReg = AddrReg->getReg();
    Offset = 0;
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
                                      MachineInstr &SecondLdSt,
                                      unsigned NumLoads) const {
  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  }

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());

  return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, unsigned DestReg,
                              unsigned SrcReg, bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
  };

  static const int16_t Sub0_7_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
  };

  static const int16_t Sub0_3_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1,
  };

  unsigned Opcode;
  ArrayRef<int16_t> SubIndices;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
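        // A VGPR holding an i1 value is copied into VCC by comparing it
        // against zero, so each lane's condition bit reflects the source.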
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_3_64;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_7_64;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_15_64;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == SubIndices.size() - 1)
      Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

    Builder.addReg(SrcReg, RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
      // m0 may not be allowed for readlane.
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg, getKillRegState(isKill)) // src
      .addFrameIndex(FrameIndex)               // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg, getKillRegState(isKill)) // src
    .addFrameIndex(FrameIndex)               // frame_idx
    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
    .addImm(0)                               // offset
    .addMemOperand(MMO);
}

static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo.getObjectSize(FrameIndex);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());

    if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
      // m0 may not be allowed for readlane.
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
        TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
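    // The 64-bit pseudo is expanded into two 32-bit moves of the low and
    // high halves of the source.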
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI.getOperand(1).getReg();
    unsigned Src1 = MI.getOperand(2).getReg();
    const MachineOperand &SrcCond = MI.getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addReg(SrcCond.getReg())
      .addReg(Dst, RegState::Implicit | RegState::Define);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill()))
      .addReg(Dst, RegState::Implicit | RegState::Define);
    MI.eraseFromParent();
    break;
  }

  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI.getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                   .addReg(RegLo)
                   .addOperand(MI.getOperand(1)));
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                   .addReg(RegHi)
                   .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for a
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned OpIdx0,
                                                  unsigned OpIdx1) const {
  int CommutedOpcode = commuteOpcode(MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);

  if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
       OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
      (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
       OpIdx1 != static_cast<unsigned>(Src0Idx)))
    return nullptr;

  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  if (isVOP2(MI) || isVOPC(MI)) {
    const MCInstrDesc &InstrDesc = MI.getDesc();
    // For VOP2 and VOPC instructions, any operand type is valid to use for
    // src0. Make sure we can use the src0 as src1.
    //
    // We could be stricter here and only allow commuting if there is a reason
    // to do so. i.e. if both operands are VGPRs there is no real benefit,
    // although MachineCSE attempts to find matches by commuting.
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
      return nullptr;
  }

  MachineInstr *CommutedMI = &MI;
  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) {
      return nullptr;
    }
    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods =
          getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods =
        getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    CommutedMI =
      TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
  }

  if (CommutedMI)
    CommutedMI->setDesc(get(CommutedOpcode));

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  const MCInstrDesc &MCID = MI.getDesc();
  if (!MCID.isCommutable())
    return false;

  unsigned Opc = MI.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
  // immediate. Also, an immediate src0 operand is not handled in
  // SIInstrInfo::commuteInstruction().
  if (!MI.getOperand(Src0Idx).isReg())
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  MachineOperand &Src1 = MI.getOperand(Src1Idx);
  if (Src1.isImm()) {
    // SIInstrInfo::commuteInstruction() does support commuting the immediate
    // operand src1 in 2 and 3 operand instructions.
    if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode()))
      return false;
  } else if (Src1.isReg()) {
    // If any source modifiers are set, the generic instruction commuting won't
    // understand how to copy the source modifiers.
    if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))
      return false;
  } else
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  if (I == MBB.end())
    return false;

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
  if (Pred == INVALID_BR)
    return true;

  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
  Cond.push_back(MachineOperand::CreateImm(Pred));

  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  return Count;
}

unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *TBB,
                                   MachineBasicBlock *FBB,
                                   ArrayRef<MachineOperand> Cond,
                                   const DebugLoc &DL) const {

  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);
    return 1;
  }

  assert(TBB && FBB);

  BuildMI(&MBB, DL, get(Opcode))
    .addMBB(TBB);
  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    .addMBB(FBB);

  return 2;
}

bool SIInstrInfo::ReverseBranchCondition(
  SmallVectorImpl<MachineOperand> &Cond) const {
  assert(Cond.size() == 1);
  Cond[0].setImm(-Cond[0].getImm());
  return false;
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

// TODO: Maybe this should be removed and everything custom folded in
// SIFoldOperands instead?
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    const MachineOperand &ImmOp = DefMI.getOperand(1);

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    if (isInlineConstant(ImmOp, 4))
      return false;

    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                               MachineInstr &MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {

    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
      // FIXME: Handle ds_read2 / ds_write2.
      return false;
    }
    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
                                                  MachineInstr &MIb,
                                                  AliasAnalysis *AA) const {
  assert((MIa.mayLoad() || MIa.mayStore()) &&
         "MIa must load from or modify a memory location");
  assert((MIb.mayLoad() || MIb.mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {

  switch (MI.getOpcode()) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F32_e64:
    break;
  case AMDGPU::V_MAC_F32_e32: {
    const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
      .addOperand(*Dst)
      .addImm(0) // Src0 mods
      .addOperand(*Src0)
      .addImm(0) // Src1 mods
      .addOperand(*Src1)
      .addImm(0) // Src2 mods
      .addOperand(*Src2)
      .addImm(0) // clamp
      .addImm(0); // omod
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // XXX - Do we want the SP check in the base implementation?

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.

    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus.
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READLANE_B32_si:
    case AMDGPU::V_READLANE_B32_vi:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::V_WRITELANE_B32_si:
    case AMDGPU::V_WRITELANE_B32_vi:
      return false;
    }

    return true;
  }

  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}

static bool isSubRegOf(const SIRegisterInfo &TRI,
                       const MachineOperand &SuperVec,
                       const MachineOperand &SubReg) {
  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}

bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI.getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI.getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      if (isLiteralConstant(MI.getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI.getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI.getOperand(i).getReg();
      if (Reg == AMDGPU::NoRegister ||
          TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;

    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
      ++ConstantBusCount;

    unsigned SGPRUsed = findImplicitSGPRRead(MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI.getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

    const unsigned StaticNumOps = Desc.getNumOperands() +
      Desc.getNumImplicitUses();
    const unsigned NumImplicitOps = IsDst ? 2 : 1;
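    // V_MOVRELD writes into a vector, so it carries an implicit def of that
    // vector tied to an implicit use; V_MOVRELS only carries the implicit
    // vector use.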
2 : 1; 1791 1792 if (MI.getNumOperands() != StaticNumOps + NumImplicitOps) { 1793 ErrInfo = "missing implicit register operands"; 1794 return false; 1795 } 1796 1797 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 1798 if (IsDst) { 1799 if (!Dst->isUse()) { 1800 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 1801 return false; 1802 } 1803 1804 unsigned UseOpIdx; 1805 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 1806 UseOpIdx != StaticNumOps + 1) { 1807 ErrInfo = "movrel implicit operands should be tied"; 1808 return false; 1809 } 1810 } 1811 1812 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 1813 const MachineOperand &ImpUse 1814 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 1815 if (!ImpUse.isReg() || !ImpUse.isUse() || 1816 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 1817 ErrInfo = "src0 should be subreg of implicit vector use"; 1818 return false; 1819 } 1820 } 1821 1822 // Make sure we aren't losing exec uses in the td files. This mostly requires 1823 // being careful when using let Uses to try to add other use registers. 1824 if (shouldReadExec(MI)) { 1825 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 1826 ErrInfo = "VALU instruction does not implicitly read exec mask"; 1827 return false; 1828 } 1829 } 1830 1831 return true; 1832 } 1833 1834 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 1835 switch (MI.getOpcode()) { 1836 default: return AMDGPU::INSTRUCTION_LIST_END; 1837 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 1838 case AMDGPU::COPY: return AMDGPU::COPY; 1839 case AMDGPU::PHI: return AMDGPU::PHI; 1840 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 1841 case AMDGPU::S_MOV_B32: 1842 return MI.getOperand(1).isReg() ? 1843 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 1844 case AMDGPU::S_ADD_I32: 1845 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 1846 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 1847 case AMDGPU::S_SUB_I32: 1848 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 1849 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 1850 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 1851 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; 1852 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; 1853 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; 1854 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; 1855 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; 1856 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; 1857 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; 1858 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 1859 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 1860 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 1861 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 1862 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 1863 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 1864 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 1865 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 1866 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 1867 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 1868 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 1869 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 1870 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 1871 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 1872 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 1873 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 1874 case AMDGPU::S_CMP_GT_I32: 
return AMDGPU::V_CMP_GT_I32_e32; 1875 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 1876 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 1877 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 1878 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 1879 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 1880 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 1881 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 1882 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 1883 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 1884 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 1885 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 1886 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 1887 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 1888 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 1889 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 1890 } 1891 } 1892 1893 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 1894 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 1895 } 1896 1897 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 1898 unsigned OpNo) const { 1899 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 1900 const MCInstrDesc &Desc = get(MI.getOpcode()); 1901 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 1902 Desc.OpInfo[OpNo].RegClass == -1) { 1903 unsigned Reg = MI.getOperand(OpNo).getReg(); 1904 1905 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1906 return MRI.getRegClass(Reg); 1907 return RI.getPhysRegClass(Reg); 1908 } 1909 1910 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 1911 return RI.getRegClass(RCID); 1912 } 1913 1914 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 1915 switch (MI.getOpcode()) { 1916 case AMDGPU::COPY: 1917 case AMDGPU::REG_SEQUENCE: 1918 case AMDGPU::PHI: 1919 case AMDGPU::INSERT_SUBREG: 1920 return RI.hasVGPRs(getOpRegClass(MI, 0)); 1921 default: 1922 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 1923 } 1924 } 1925 1926 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 1927 MachineBasicBlock::iterator I = MI; 1928 MachineBasicBlock *MBB = MI.getParent(); 1929 MachineOperand &MO = MI.getOperand(OpIdx); 1930 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1931 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 1932 const TargetRegisterClass *RC = RI.getRegClass(RCID); 1933 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1934 if (MO.isReg()) 1935 Opcode = AMDGPU::COPY; 1936 else if (RI.isSGPRClass(RC)) 1937 Opcode = AMDGPU::S_MOV_B32; 1938 1939 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 1940 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 1941 VRC = &AMDGPU::VReg_64RegClass; 1942 else 1943 VRC = &AMDGPU::VGPR_32RegClass; 1944 1945 unsigned Reg = MRI.createVirtualRegister(VRC); 1946 DebugLoc DL = MBB->findDebugLoc(I); 1947 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); 1948 MO.ChangeToRegister(Reg, false); 1949 } 1950 1951 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 1952 MachineRegisterInfo &MRI, 1953 MachineOperand &SuperReg, 1954 const TargetRegisterClass *SuperRC, 1955 unsigned SubIdx, 1956 const TargetRegisterClass *SubRC) 1957 const { 1958 MachineBasicBlock *MBB = MI->getParent(); 1959 DebugLoc DL = MI->getDebugLoc(); 1960 unsigned SubReg = 
MRI.createVirtualRegister(SubRC); 1961 1962 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 1963 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1964 .addReg(SuperReg.getReg(), 0, SubIdx); 1965 return SubReg; 1966 } 1967 1968 // Just in case the super register is itself a sub-register, copy it to a new 1969 // value so we don't need to worry about merging its subreg index with the 1970 // SubIdx passed to this function. The register coalescer should be able to 1971 // eliminate this extra copy. 1972 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 1973 1974 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 1975 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 1976 1977 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1978 .addReg(NewSuperReg, 0, SubIdx); 1979 1980 return SubReg; 1981 } 1982 1983 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 1984 MachineBasicBlock::iterator MII, 1985 MachineRegisterInfo &MRI, 1986 MachineOperand &Op, 1987 const TargetRegisterClass *SuperRC, 1988 unsigned SubIdx, 1989 const TargetRegisterClass *SubRC) const { 1990 if (Op.isImm()) { 1991 // XXX - Is there a better way to do this? 1992 if (SubIdx == AMDGPU::sub0) 1993 return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); 1994 if (SubIdx == AMDGPU::sub1) 1995 return MachineOperand::CreateImm(Op.getImm() >> 32); 1996 1997 llvm_unreachable("Unhandled register index for immediate"); 1998 } 1999 2000 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 2001 SubIdx, SubRC); 2002 return MachineOperand::CreateReg(SubReg, false); 2003 } 2004 2005 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 2006 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 2007 assert(Inst.getNumExplicitOperands() == 3); 2008 MachineOperand Op1 = Inst.getOperand(1); 2009 Inst.RemoveOperand(1); 2010 Inst.addOperand(Op1); 2011 } 2012 2013 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 2014 const MCOperandInfo &OpInfo, 2015 const MachineOperand &MO) const { 2016 if (!MO.isReg()) 2017 return false; 2018 2019 unsigned Reg = MO.getReg(); 2020 const TargetRegisterClass *RC = 2021 TargetRegisterInfo::isVirtualRegister(Reg) ? 2022 MRI.getRegClass(Reg) : 2023 RI.getPhysRegClass(Reg); 2024 2025 const SIRegisterInfo *TRI = 2026 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 2027 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 2028 2029 // In order to be legal, the common sub-class must be equal to the 2030 // class of the current operand. For example: 2031 // 2032 // v_mov_b32 s0 ; Operand defined as vsrc_32 2033 // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL 2034 // 2035 // s_sendmsg 0, s0 ; Operand defined as m0reg 2036 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 2037 2038 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 2039 } 2040 2041 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 2042 const MCOperandInfo &OpInfo, 2043 const MachineOperand &MO) const { 2044 if (MO.isReg()) 2045 return isLegalRegOperand(MRI, OpInfo, MO); 2046 2047 // Handle non-register types that are treated like immediates. 
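// These are plain immediates, target indexes and frame indexes; the latter
// two are resolved to constants later, so they are as legal here as any
// immediate.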
2048 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2049 return true;
2050 }
2051 
2052 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
2053 const MachineOperand *MO) const {
2054 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2055 const MCInstrDesc &InstDesc = MI.getDesc();
2056 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
2057 const TargetRegisterClass *DefinedRC =
2058 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
2059 if (!MO)
2060 MO = &MI.getOperand(OpIdx);
2061 
2062 if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
2063 
2064 RegSubRegPair SGPRUsed;
2065 if (MO->isReg())
2066 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
2067 
2068 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2069 if (i == OpIdx)
2070 continue;
2071 const MachineOperand &Op = MI.getOperand(i);
2072 if (Op.isReg()) {
2073 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2074 usesConstantBus(MRI, Op, getOpSize(MI, i))) {
2075 return false;
2076 }
2077 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
2078 return false;
2079 }
2080 }
2081 }
2082 
2083 if (MO->isReg()) {
2084 assert(DefinedRC);
2085 return isLegalRegOperand(MRI, OpInfo, *MO);
2086 }
2087 
2088 // Handle non-register types that are treated like immediates.
2089 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
2090 
2091 if (!DefinedRC) {
2092 // This operand expects an immediate.
2093 return true;
2094 }
2095 
2096 return isImmOperandLegal(MI, OpIdx, *MO);
2097 }
2098 
2099 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
2100 MachineInstr &MI) const {
2101 unsigned Opc = MI.getOpcode();
2102 const MCInstrDesc &InstrDesc = get(Opc);
2103 
2104 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2105 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2106 
2107 // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
2108 // we need to make sure there is only one constant bus use.
2109 //
2110 // Note we do not need to worry about literal constants here. They are
2111 // disabled in the operand type for these instructions because they would
2112 // always violate the one constant bus use rule.
2113 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
2114 if (HasImplicitSGPR) {
2115 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2116 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2117 
2118 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
2119 legalizeOpWithMove(MI, Src0Idx);
2120 }
2121 
2122 // VOP2 instructions support all operand types for src0, so we don't need to
2123 // check its legality. If src1 is already legal, we don't need to do anything.
2124 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
2125 return;
2126 
2127 // We do not use commuteInstruction here because it is too aggressive and will
2128 // commute whenever it is possible. We only want to commute here if it improves
2129 // legality. This can be called a fairly large number of times so don't waste
2130 // compile time pointlessly swapping and checking legality again.
2131 if (HasImplicitSGPR || !MI.isCommutable()) {
2132 legalizeOpWithMove(MI, Src1Idx);
2133 return;
2134 }
2135 
2136 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2137 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2138 
2139 // If src0 can be used as src1, commuting will make the operands legal.
2140 // Otherwise we have to give up and insert a move. 2141 // 2142 // TODO: Other immediate-like operand kinds could be commuted if there was a 2143 // MachineOperand::ChangeTo* for them. 2144 if ((!Src1.isImm() && !Src1.isReg()) || 2145 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 2146 legalizeOpWithMove(MI, Src1Idx); 2147 return; 2148 } 2149 2150 int CommutedOpc = commuteOpcode(MI); 2151 if (CommutedOpc == -1) { 2152 legalizeOpWithMove(MI, Src1Idx); 2153 return; 2154 } 2155 2156 MI.setDesc(get(CommutedOpc)); 2157 2158 unsigned Src0Reg = Src0.getReg(); 2159 unsigned Src0SubReg = Src0.getSubReg(); 2160 bool Src0Kill = Src0.isKill(); 2161 2162 if (Src1.isImm()) 2163 Src0.ChangeToImmediate(Src1.getImm()); 2164 else if (Src1.isReg()) { 2165 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 2166 Src0.setSubReg(Src1.getSubReg()); 2167 } else 2168 llvm_unreachable("Should only have register or immediate operands"); 2169 2170 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 2171 Src1.setSubReg(Src0SubReg); 2172 } 2173 2174 // Legalize VOP3 operands. Because all operand types are supported for any 2175 // operand, and since literal constants are not allowed and should never be 2176 // seen, we only need to worry about inserting copies if we use multiple SGPR 2177 // operands. 2178 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 2179 MachineInstr &MI) const { 2180 unsigned Opc = MI.getOpcode(); 2181 2182 int VOP3Idx[3] = { 2183 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 2184 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 2185 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 2186 }; 2187 2188 // Find the one SGPR operand we are allowed to use. 2189 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 2190 2191 for (unsigned i = 0; i < 3; ++i) { 2192 int Idx = VOP3Idx[i]; 2193 if (Idx == -1) 2194 break; 2195 MachineOperand &MO = MI.getOperand(Idx); 2196 2197 // We should never see a VOP3 instruction with an illegal immediate operand. 2198 if (!MO.isReg()) 2199 continue; 2200 2201 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2202 continue; // VGPRs are legal 2203 2204 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { 2205 SGPRReg = MO.getReg(); 2206 // We can use one SGPR in each VOP3 instruction. 2207 continue; 2208 } 2209 2210 // If we make it this far, then the operand is not legal and we must 2211 // legalize it. 
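// e.g. in V_FMA_F32 v0, s0, s1, v1, s0 can stay as the single allowed SGPR,
// but s1 reaches this point and is copied into a fresh VGPR so that only
// one operand reads the constant bus.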
2212 legalizeOpWithMove(MI, Idx);
2213 }
2214 }
2215 
2216 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
2217 MachineRegisterInfo &MRI) const {
2218 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
2219 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
2220 unsigned DstReg = MRI.createVirtualRegister(SRC);
2221 unsigned SubRegs = VRC->getSize() / 4;
2222 
2223 SmallVector<unsigned, 8> SRegs;
2224 for (unsigned i = 0; i < SubRegs; ++i) {
2225 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2226 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2227 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2228 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2229 SRegs.push_back(SGPR);
2230 }
2231 
2232 MachineInstrBuilder MIB =
2233 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2234 get(AMDGPU::REG_SEQUENCE), DstReg);
2235 for (unsigned i = 0; i < SubRegs; ++i) {
2236 MIB.addReg(SRegs[i]);
2237 MIB.addImm(RI.getSubRegFromChannel(i));
2238 }
2239 return DstReg;
2240 }
2241 
2242 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2243 MachineInstr &MI) const {
2244 
2245 // If the pointer is stored in VGPRs, then we need to move it to
2246 // SGPRs using v_readfirstlane. This is safe because we only select
2247 // loads with uniform pointers to SMRD instructions, so we know the
2248 // pointer value is uniform.
2249 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
2250 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2251 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2252 SBase->setReg(SGPR);
2253 }
2254 }
2255 
2256 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
2257 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2258 
2259 // Legalize VOP2
2260 if (isVOP2(MI) || isVOPC(MI)) {
2261 legalizeOperandsVOP2(MRI, MI);
2262 return;
2263 }
2264 
2265 // Legalize VOP3
2266 if (isVOP3(MI)) {
2267 legalizeOperandsVOP3(MRI, MI);
2268 return;
2269 }
2270 
2271 // Legalize SMRD
2272 if (isSMRD(MI)) {
2273 legalizeOperandsSMRD(MRI, MI);
2274 return;
2275 }
2276 
2277 // Legalize PHI
2278 // The register class of the operands must match the register class
2279 // of the output.
2280 if (MI.getOpcode() == AMDGPU::PHI) {
2281 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2282 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
2283 if (!MI.getOperand(i).isReg() ||
2284 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
2285 continue;
2286 const TargetRegisterClass *OpRC =
2287 MRI.getRegClass(MI.getOperand(i).getReg());
2288 if (RI.hasVGPRs(OpRC)) {
2289 VRC = OpRC;
2290 } else {
2291 SRC = OpRC;
2292 }
2293 }
2294 
2295 // If any of the operands are VGPR registers, then they all must be VGPRs;
2296 // otherwise we will create illegal VGPR->SGPR copies when legalizing
2297 // them.
2298 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
2299 if (!VRC) {
2300 assert(SRC);
2301 VRC = RI.getEquivalentVGPRClass(SRC);
2302 }
2303 RC = VRC;
2304 } else {
2305 RC = SRC;
2306 }
2307 
2308 // Update all the operands so they have the same type.
2309 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2310 MachineOperand &Op = MI.getOperand(I);
2311 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2312 continue;
2313 unsigned DstReg = MRI.createVirtualRegister(RC);
2314 
2315 // MI is a PHI instruction.
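// The copy for an incoming value must be inserted in the corresponding
// predecessor block, before its terminator, rather than next to the PHI,
// since PHIs must stay grouped at the top of their block.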
2316 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
2317 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2318 
2319 BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
2320 .addOperand(Op);
2321 Op.setReg(DstReg);
2322 }
2323 }
2324 
2325 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2326 // VGPR dest type and SGPR sources, insert copies so all operands are
2327 // VGPRs. This seems to help operand folding / the register coalescer.
2328 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
2329 MachineBasicBlock *MBB = MI.getParent();
2330 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
2331 if (RI.hasVGPRs(DstRC)) {
2332 // Update all the operands so they are VGPR register classes. These may
2333 // not be the same register class because REG_SEQUENCE supports mixing
2334 // subregister index types, e.g. sub0_sub1 + sub2 + sub3
2335 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2336 MachineOperand &Op = MI.getOperand(I);
2337 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2338 continue;
2339 
2340 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2341 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2342 if (VRC == OpRC)
2343 continue;
2344 
2345 unsigned DstReg = MRI.createVirtualRegister(VRC);
2346 
2347 BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg)
2348 .addOperand(Op);
2349 
2350 Op.setReg(DstReg);
2351 Op.setIsKill();
2352 }
2353 }
2354 
2355 return;
2356 }
2357 
2358 // Legalize INSERT_SUBREG
2359 // src0 must have the same register class as dst
2360 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
2361 unsigned Dst = MI.getOperand(0).getReg();
2362 unsigned Src0 = MI.getOperand(1).getReg();
2363 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2364 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2365 if (DstRC != Src0RC) {
2366 MachineBasicBlock &MBB = *MI.getParent();
2367 unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
2368 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
2369 .addReg(Src0);
2370 MI.getOperand(1).setReg(NewSrc0);
2371 }
2372 return;
2373 }
2374 
2375 // Legalize MIMG
2376 if (isMIMG(MI)) {
2377 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
2378 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2379 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2380 SRsrc->setReg(SGPR);
2381 }
2382 
2383 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
2384 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2385 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2386 SSamp->setReg(SGPR);
2387 }
2388 return;
2389 }
2390 
2391 // Legalize MUBUF* instructions
2392 // FIXME: If we start using the non-addr64 instructions for compute, we
2393 // may need to legalize them here.
2394 int SRsrcIdx =
2395 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
2396 if (SRsrcIdx != -1) {
2397 // We have an MUBUF instruction
2398 MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
2399 unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
2400 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2401 RI.getRegClass(SRsrcRC))) {
2402 // The operands are legal.
2403 // FIXME: We may need to legalize operands besides srsrc.
2404 return;
2405 }
2406 
2407 MachineBasicBlock &MBB = *MI.getParent();
2408 
2409 // Extract the ptr from the resource descriptor.
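// Roughly: the 64-bit pointer held in srsrc[0:1] is moved (or added) into
// vaddr, and srsrc is replaced by a descriptor with a zero base and the
// default data format, so the hardware address computation still sees the
// original pointer. The replacement descriptor is built first, below.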
2410 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2411 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2412 
2413 // Create an empty resource descriptor
2414 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2415 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2416 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2417 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2418 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2419 
2420 // Zero64 = 0
2421 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
2422 .addImm(0);
2423 
2424 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2425 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
2426 .addImm(RsrcDataFormat & 0xFFFFFFFF);
2427 
2428 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2429 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
2430 .addImm(RsrcDataFormat >> 32);
2431 
2432 // NewSRsrc = {Zero64, SRsrcFormat}
2433 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2434 .addReg(Zero64)
2435 .addImm(AMDGPU::sub0_sub1)
2436 .addReg(SRsrcFormatLo)
2437 .addImm(AMDGPU::sub2)
2438 .addReg(SRsrcFormatHi)
2439 .addImm(AMDGPU::sub3);
2440 
2441 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
2442 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2443 if (VAddr) {
2444 // This is already an ADDR64 instruction so we need to add the pointer
2445 // extracted from the resource descriptor to the current value of VAddr.
2446 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2447 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2448 
2449 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2450 DebugLoc DL = MI.getDebugLoc();
2451 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2452 .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2453 .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2454 
2455 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2456 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2457 .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2458 .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2459 
2460 // NewVaddr = {NewVaddrLo, NewVaddrHi}
2461 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2462 .addReg(NewVAddrLo)
2463 .addImm(AMDGPU::sub0)
2464 .addReg(NewVAddrHi)
2465 .addImm(AMDGPU::sub1);
2466 } else {
2467 // This instruction is the _OFFSET variant, so we need to convert it to
2468 // ADDR64.
2469 assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
2470 < SISubtarget::VOLCANIC_ISLANDS &&
2471 "FIXME: Need to emit flat atomics here");
2472 
2473 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
2474 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2475 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
2476 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
2477 
2478 // Atomics with return have an additional tied operand and are
2479 // missing some of the special bits.
2480 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
2481 MachineInstr *Addr64;
2482 
2483 if (!VDataIn) {
2484 // Regular buffer load / store.
2485 MachineInstrBuilder MIB =
2486 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
2487 .addOperand(*VData)
2488 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2489 // This will be replaced later
2490 // with the new value of vaddr.
2491 .addOperand(*SRsrc)
2492 .addOperand(*SOffset)
2493 .addOperand(*Offset);
2494 
2495 // Atomics do not have this operand.
2496 if (const MachineOperand *GLC =
2497 getNamedOperand(MI, AMDGPU::OpName::glc)) {
2498 MIB.addImm(GLC->getImm());
2499 }
2500 
2501 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
2502 
2503 if (const MachineOperand *TFE =
2504 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
2505 MIB.addImm(TFE->getImm());
2506 }
2507 
2508 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
2509 Addr64 = MIB;
2510 } else {
2511 // Atomics with return.
2512 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
2513 .addOperand(*VData)
2514 .addOperand(*VDataIn)
2515 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2516 // This will be replaced later
2517 // with the new value of vaddr.
2518 .addOperand(*SRsrc)
2519 .addOperand(*SOffset)
2520 .addOperand(*Offset)
2521 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
2522 .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
2523 }
2524 
2525 MI.removeFromParent();
2526 
2527 // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
2528 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
2529 NewVAddr)
2530 .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2531 .addImm(AMDGPU::sub0)
2532 .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2533 .addImm(AMDGPU::sub1);
2534 
2535 VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
2536 SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
2537 }
2538 
2539 // Update the instruction to use NewVaddr
2540 VAddr->setReg(NewVAddr);
2541 // Update the instruction to use NewSRsrc
2542 SRsrc->setReg(NewSRsrc);
2543 }
2544 }
2545 
2546 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
2547 SmallVector<MachineInstr *, 128> Worklist;
2548 Worklist.push_back(&TopInst);
2549 
2550 while (!Worklist.empty()) {
2551 MachineInstr &Inst = *Worklist.pop_back_val();
2552 MachineBasicBlock *MBB = Inst.getParent();
2553 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2554 
2555 unsigned Opcode = Inst.getOpcode();
2556 unsigned NewOpcode = getVALUOp(Inst);
2557 
2558 // Handle some special cases
2559 switch (Opcode) {
2560 default:
2561 break;
2562 case AMDGPU::S_AND_B64:
2563 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
2564 Inst.eraseFromParent();
2565 continue;
2566 
2567 case AMDGPU::S_OR_B64:
2568 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
2569 Inst.eraseFromParent();
2570 continue;
2571 
2572 case AMDGPU::S_XOR_B64:
2573 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
2574 Inst.eraseFromParent();
2575 continue;
2576 
2577 case AMDGPU::S_NOT_B64:
2578 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
2579 Inst.eraseFromParent();
2580 continue;
2581 
2582 case AMDGPU::S_BCNT1_I32_B64:
2583 splitScalar64BitBCNT(Worklist, Inst);
2584 Inst.eraseFromParent();
2585 continue;
2586 
2587 case AMDGPU::S_BFE_I64: {
2588 splitScalar64BitBFE(Worklist, Inst);
2589 Inst.eraseFromParent();
2590 continue;
2591 }
2592 
2593 case AMDGPU::S_LSHL_B32:
2594 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
2595 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
2596 swapOperands(Inst);
2597 }
2598 break;
2599 case AMDGPU::S_ASHR_I32:
2600 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
2601 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
2602 swapOperands(Inst);
2603 }
2604 break;
2605 case AMDGPU::S_LSHR_B32:
2606 if (ST.getGeneration() >=
SISubtarget::VOLCANIC_ISLANDS) {
2607 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
2608 swapOperands(Inst);
2609 }
2610 break;
2611 case AMDGPU::S_LSHL_B64:
2612 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
2613 NewOpcode = AMDGPU::V_LSHLREV_B64;
2614 swapOperands(Inst);
2615 }
2616 break;
2617 case AMDGPU::S_ASHR_I64:
2618 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
2619 NewOpcode = AMDGPU::V_ASHRREV_I64;
2620 swapOperands(Inst);
2621 }
2622 break;
2623 case AMDGPU::S_LSHR_B64:
2624 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
2625 NewOpcode = AMDGPU::V_LSHRREV_B64;
2626 swapOperands(Inst);
2627 }
2628 break;
2629 
2630 case AMDGPU::S_ABS_I32:
2631 lowerScalarAbs(Worklist, Inst);
2632 Inst.eraseFromParent();
2633 continue;
2634 
2635 case AMDGPU::S_CBRANCH_SCC0:
2636 case AMDGPU::S_CBRANCH_SCC1:
2637 // Clear unused bits of vcc
2638 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
2639 AMDGPU::VCC)
2640 .addReg(AMDGPU::EXEC)
2641 .addReg(AMDGPU::VCC);
2642 break;
2643 
2644 case AMDGPU::S_BFE_U64:
2645 case AMDGPU::S_BFM_B64:
2646 llvm_unreachable("Moving this op to VALU not implemented");
2647 }
2648 
2649 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
2650 // We cannot move this instruction to the VALU, so we should try to
2651 // legalize its operands instead.
2652 legalizeOperands(Inst);
2653 continue;
2654 }
2655 
2656 // Use the new VALU Opcode.
2657 const MCInstrDesc &NewDesc = get(NewOpcode);
2658 Inst.setDesc(NewDesc);
2659 
2660 // Remove any references to SCC. Vector instructions can't read from it, and
2661 // we're just about to add the implicit use / defs of VCC, and we don't want
2662 // both.
2663 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
2664 MachineOperand &Op = Inst.getOperand(i);
2665 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
2666 Inst.RemoveOperand(i);
2667 addSCCDefUsersToVALUWorklist(Inst, Worklist);
2668 }
2669 }
2670 
2671 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
2672 // We are converting these to a BFE, so we need to add the missing
2673 // operands for the size and offset.
2674 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
2675 Inst.addOperand(MachineOperand::CreateImm(0));
2676 Inst.addOperand(MachineOperand::CreateImm(Size));
2677 
2678 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
2679 // The VALU version adds the second operand to the result, so insert an
2680 // extra 0 operand.
2681 Inst.addOperand(MachineOperand::CreateImm(0));
2682 }
2683 
2684 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
2685 
2686 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
2687 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
2688 // If we need to move this to VGPRs, we need to unpack the second operand
2689 // back into the 2 separate ones for bit offset and width.
2690 assert(OffsetWidthOp.isImm() &&
2691 "Scalar BFE is only implemented for constant width and offset");
2692 uint32_t Imm = OffsetWidthOp.getImm();
2693 
2694 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2695 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2696 Inst.RemoveOperand(2); // Remove old immediate.
2697 Inst.addOperand(MachineOperand::CreateImm(Offset));
2698 Inst.addOperand(MachineOperand::CreateImm(BitWidth));
2699 }
2700 
2701 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
2702 unsigned NewDstReg = AMDGPU::NoRegister;
2703 if (HasDst) {
2704 // Update the destination register class.
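// e.g. an sgpr_32 def becomes a fresh vgpr_32 register; replaceRegWith then
// retargets every user, and any user that cannot read a VGPR is pushed back
// onto the worklist below.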
2705 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 2706 if (!NewDstRC) 2707 continue; 2708 2709 unsigned DstReg = Inst.getOperand(0).getReg(); 2710 NewDstReg = MRI.createVirtualRegister(NewDstRC); 2711 MRI.replaceRegWith(DstReg, NewDstReg); 2712 } 2713 2714 // Legalize the operands 2715 legalizeOperands(Inst); 2716 2717 if (HasDst) 2718 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 2719 } 2720 } 2721 2722 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 2723 MachineInstr &Inst) const { 2724 MachineBasicBlock &MBB = *Inst.getParent(); 2725 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2726 MachineBasicBlock::iterator MII = Inst; 2727 DebugLoc DL = Inst.getDebugLoc(); 2728 2729 MachineOperand &Dest = Inst.getOperand(0); 2730 MachineOperand &Src = Inst.getOperand(1); 2731 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2732 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2733 2734 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 2735 .addImm(0) 2736 .addReg(Src.getReg()); 2737 2738 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 2739 .addReg(Src.getReg()) 2740 .addReg(TmpReg); 2741 2742 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2743 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2744 } 2745 2746 void SIInstrInfo::splitScalar64BitUnaryOp( 2747 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, 2748 unsigned Opcode) const { 2749 MachineBasicBlock &MBB = *Inst.getParent(); 2750 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2751 2752 MachineOperand &Dest = Inst.getOperand(0); 2753 MachineOperand &Src0 = Inst.getOperand(1); 2754 DebugLoc DL = Inst.getDebugLoc(); 2755 2756 MachineBasicBlock::iterator MII = Inst; 2757 2758 const MCInstrDesc &InstDesc = get(Opcode); 2759 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2760 MRI.getRegClass(Src0.getReg()) : 2761 &AMDGPU::SGPR_32RegClass; 2762 2763 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2764 2765 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2766 AMDGPU::sub0, Src0SubRC); 2767 2768 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2769 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 2770 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 2771 2772 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 2773 BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2774 .addOperand(SrcReg0Sub0); 2775 2776 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2777 AMDGPU::sub1, Src0SubRC); 2778 2779 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 2780 BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2781 .addOperand(SrcReg0Sub1); 2782 2783 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 2784 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2785 .addReg(DestSub0) 2786 .addImm(AMDGPU::sub0) 2787 .addReg(DestSub1) 2788 .addImm(AMDGPU::sub1); 2789 2790 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2791 2792 // We don't need to legalizeOperands here because for a single operand, src0 2793 // will support any kind of input. 2794 2795 // Move all users of this moved value. 
2796 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2797 }
2798 
2799 void SIInstrInfo::splitScalar64BitBinaryOp(
2800 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
2801 unsigned Opcode) const {
2802 MachineBasicBlock &MBB = *Inst.getParent();
2803 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2804 
2805 MachineOperand &Dest = Inst.getOperand(0);
2806 MachineOperand &Src0 = Inst.getOperand(1);
2807 MachineOperand &Src1 = Inst.getOperand(2);
2808 DebugLoc DL = Inst.getDebugLoc();
2809 
2810 MachineBasicBlock::iterator MII = Inst;
2811 
2812 const MCInstrDesc &InstDesc = get(Opcode);
2813 const TargetRegisterClass *Src0RC = Src0.isReg() ?
2814 MRI.getRegClass(Src0.getReg()) :
2815 &AMDGPU::SGPR_32RegClass;
2816 
2817 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
2818 const TargetRegisterClass *Src1RC = Src1.isReg() ?
2819 MRI.getRegClass(Src1.getReg()) :
2820 &AMDGPU::SGPR_32RegClass;
2821 
2822 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
2823 
2824 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2825 AMDGPU::sub0, Src0SubRC);
2826 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2827 AMDGPU::sub0, Src1SubRC);
2828 
2829 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2830 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2831 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2832 
2833 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2834 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2835 .addOperand(SrcReg0Sub0)
2836 .addOperand(SrcReg1Sub0);
2837 
2838 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2839 AMDGPU::sub1, Src0SubRC);
2840 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2841 AMDGPU::sub1, Src1SubRC);
2842 
2843 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2844 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2845 .addOperand(SrcReg0Sub1)
2846 .addOperand(SrcReg1Sub1);
2847 
2848 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2849 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2850 .addReg(DestSub0)
2851 .addImm(AMDGPU::sub0)
2852 .addReg(DestSub1)
2853 .addImm(AMDGPU::sub1);
2854 
2855 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2856 
2857 // Try to legalize the operands in case we need to swap the order to keep it
2858 // valid.
2859 legalizeOperands(LoHalf);
2860 legalizeOperands(HiHalf);
2861 
2862 // Move all users of this moved value.
2863 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2864 }
2865 
2866 void SIInstrInfo::splitScalar64BitBCNT(
2867 SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
2868 MachineBasicBlock &MBB = *Inst.getParent();
2869 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2870 
2871 MachineBasicBlock::iterator MII = Inst;
2872 DebugLoc DL = Inst.getDebugLoc();
2873 
2874 MachineOperand &Dest = Inst.getOperand(0);
2875 MachineOperand &Src = Inst.getOperand(1);
2876 
2877 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
2878 const TargetRegisterClass *SrcRC = Src.isReg() ?
2879 MRI.getRegClass(Src.getReg()) :
2880 &AMDGPU::SGPR_32RegClass;
2881 
2882 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2883 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2884 
2885 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2886 
2887 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2888 AMDGPU::sub0, SrcSubRC);
2889 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2890 AMDGPU::sub1, SrcSubRC);
2891 
2892 BuildMI(MBB, MII, DL, InstDesc, MidReg)
2893 .addOperand(SrcRegSub0)
2894 .addImm(0);
2895 
2896 BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2897 .addOperand(SrcRegSub1)
2898 .addReg(MidReg);
2899 
2900 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2901 
2902 // We don't need to legalize operands here. src0 for either instruction can be
2903 // an SGPR, and the second input is unused or determined here.
2904 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2905 }
2906 
2907 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
2908 MachineInstr &Inst) const {
2909 MachineBasicBlock &MBB = *Inst.getParent();
2910 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2911 MachineBasicBlock::iterator MII = Inst;
2912 DebugLoc DL = Inst.getDebugLoc();
2913 
2914 MachineOperand &Dest = Inst.getOperand(0);
2915 uint32_t Imm = Inst.getOperand(2).getImm();
2916 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2917 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2918 
2919 (void) Offset;
2920 
2921 // Only sext_inreg cases are handled.
2922 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
2923 Offset == 0 && "Not implemented");
2924 
2925 if (BitWidth < 32) {
2926 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2927 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2928 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2929 
2930 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
2931 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
2932 .addImm(0)
2933 .addImm(BitWidth);
2934 
2935 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
2936 .addImm(31)
2937 .addReg(MidRegLo);
2938 
2939 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2940 .addReg(MidRegLo)
2941 .addImm(AMDGPU::sub0)
2942 .addReg(MidRegHi)
2943 .addImm(AMDGPU::sub1);
2944 
2945 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2946 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2947 return;
2948 }
2949 
2950 MachineOperand &Src = Inst.getOperand(1);
2951 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2952 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2953 
2954 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
2955 .addImm(31)
2956 .addReg(Src.getReg(), 0, AMDGPU::sub0);
2957 
2958 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2959 .addReg(Src.getReg(), 0, AMDGPU::sub0)
2960 .addImm(AMDGPU::sub0)
2961 .addReg(TmpReg)
2962 .addImm(AMDGPU::sub1);
2963 
2964 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2965 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2966 }
2967 
2968 void SIInstrInfo::addUsersToMoveToVALUWorklist(
2969 unsigned DstReg,
2970 MachineRegisterInfo &MRI,
2971 SmallVectorImpl<MachineInstr *> &Worklist) const {
2972 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
2973 E = MRI.use_end(); I != E; ++I) {
2974 MachineInstr &UseMI = *I->getParent();
2975 if (!canReadVGPR(UseMI, I.getOperandNo())) {
2976 Worklist.push_back(&UseMI);
2977 }
2978 }
2979 }
2980 
2981 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
2982 MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
2983 // This assumes that all the users of SCC are in the same block
2984 // as the SCC def.
2985 for (MachineInstr &MI :
2986 llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
2987 SCCDefInst.getParent()->end())) {
2988 // Exit if we find another SCC def.
2989 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
2990 return;
2991 
2992 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
2993 Worklist.push_back(&MI);
2994 }
2995 }
2996 
2997 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
2998 const MachineInstr &Inst) const {
2999 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
3000 
3001 switch (Inst.getOpcode()) {
3002 // For these generic, target-independent opcodes, getOpRegClass just returns
3003 // the virtual register class associated with the operand, so we need to find
3004 // an equivalent VGPR register class in order to move the instruction to the VALU.
3005 case AMDGPU::COPY:
3006 case AMDGPU::PHI:
3007 case AMDGPU::REG_SEQUENCE:
3008 case AMDGPU::INSERT_SUBREG:
3009 if (RI.hasVGPRs(NewDstRC))
3010 return nullptr;
3011 
3012 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
3013 if (!NewDstRC)
3014 return nullptr;
3015 return NewDstRC;
3016 default:
3017 return NewDstRC;
3018 }
3019 }
3020 
3021 // Find the one SGPR operand we are allowed to use.
3022 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
3023 int OpIndices[3]) const {
3024 const MCInstrDesc &Desc = MI.getDesc();
3025 
3026 // Find the one SGPR operand we are allowed to use.
3027 //
3028 // First we need to consider the instruction's operand requirements before
3029 // legalizing. Some operands are required to be SGPRs, such as implicit uses
3030 // of VCC, but we are still bound by the constant bus requirement to only use
3031 // one.
3032 //
3033 // If the operand's class is an SGPR, we can never move it.
3034 
3035 unsigned SGPRReg = findImplicitSGPRRead(MI);
3036 if (SGPRReg != AMDGPU::NoRegister)
3037 return SGPRReg;
3038 
3039 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
3040 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3041 
3042 for (unsigned i = 0; i < 3; ++i) {
3043 int Idx = OpIndices[i];
3044 if (Idx == -1)
3045 break;
3046 
3047 const MachineOperand &MO = MI.getOperand(Idx);
3048 if (!MO.isReg())
3049 continue;
3050 
3051 // Is this operand statically required to be an SGPR based on the operand
3052 // constraints?
3053 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
3054 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
3055 if (IsRequiredSGPR)
3056 return MO.getReg();
3057 
3058 // If this could be a VGPR or an SGPR, check the dynamic register class.
3059 unsigned Reg = MO.getReg();
3060 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
3061 if (RI.isSGPRClass(RegRC))
3062 UsedSGPRs[i] = Reg;
3063 }
3064 
3065 // We don't have a required SGPR operand, so we have a bit more freedom in
3066 // selecting operands to move.
3067 
3068 // Try to select the most used SGPR. If an SGPR is equal to one of the
3069 // others, we choose that.
3070 //
3071 // e.g.
3072 // V_FMA_F32 v0, s0, s0, s0 -> No moves
3073 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
3074 
3075 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
3076 // prefer those.
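// With the examples above, UsedSGPRs holds {s0, s0, s0} (s0 is chosen and
// no moves are needed) or {s0, s1, s0} (s0 is chosen, so only s1 has to be
// moved).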
3077 
3078 if (UsedSGPRs[0] != AMDGPU::NoRegister) {
3079 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
3080 SGPRReg = UsedSGPRs[0];
3081 }
3082 
3083 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
3084 if (UsedSGPRs[1] == UsedSGPRs[2])
3085 SGPRReg = UsedSGPRs[1];
3086 }
3087 
3088 return SGPRReg;
3089 }
3090 
3091 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
3092 unsigned OperandName) const {
3093 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
3094 if (Idx == -1)
3095 return nullptr;
3096 
3097 return &MI.getOperand(Idx);
3098 }
3099 
3100 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
3101 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
3102 if (ST.isAmdHsaOS()) {
3103 RsrcDataFormat |= (1ULL << 56);
3104 
3105 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3106 // Set MTYPE = 2
3107 RsrcDataFormat |= (2ULL << 59);
3108 }
3109 
3110 return RsrcDataFormat;
3111 }
3112 
3113 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
3114 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
3115 AMDGPU::RSRC_TID_ENABLE |
3116 0xffffffff; // Size
3117 
3118 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
3119 
3120 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
3121 // IndexStride = 64
3122 (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
3123 
3124 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
3125 // Clear them unless we want a huge stride.
3126 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3127 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
3128 
3129 return Rsrc23;
3130 }
3131 
3132 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
3133 unsigned Opc = MI.getOpcode();
3134 
3135 return isSMRD(Opc);
3136 }
3137 
3138 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
3139 unsigned Opc = MI.getOpcode();
3140 
3141 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
3142 }
3143 
3144 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
3145 unsigned Opc = MI.getOpcode();
3146 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
3147 unsigned DescSize = Desc.getSize();
3148 
3149 // If we have a definitive size, we can use it. Otherwise we need to inspect
3150 // the operands to know the size.
3151 if (DescSize == 8 || DescSize == 4)
3152 return DescSize;
3153 
3154 assert(DescSize == 0);
3155 
3156 // 4-byte instructions may have a 32-bit literal encoded after them. Check
3157 // operands that could ever be literals.
3158 if (isVALU(MI) || isSALU(MI)) {
3159 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3160 if (Src0Idx == -1)
3161 return 4; // No operands.
3162 
3163 if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
3164 return 8;
3165 
3166 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3167 if (Src1Idx == -1)
3168 return 4;
3169 
3170 if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
3171 return 8;
3172 
3173 return 4;
3174 }
3175 
3176 switch (Opc) {
3177 case TargetOpcode::IMPLICIT_DEF:
3178 case TargetOpcode::KILL:
3179 case TargetOpcode::DBG_VALUE:
3180 case TargetOpcode::BUNDLE:
3181 case TargetOpcode::EH_LABEL:
3182 return 0;
3183 case TargetOpcode::INLINEASM: {
3184 const MachineFunction *MF = MI.getParent()->getParent();
3185 const char *AsmStr = MI.getOperand(0).getSymbolName();
3186 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
3187 }
3188 default:
3189 llvm_unreachable("unable to find instruction size");
3190 }
3191 }
3192 
3193 ArrayRef<std::pair<int, const char *>>
3194 SIInstrInfo::getSerializableTargetIndices() const {
3195 static const std::pair<int, const char *> TargetIndices[] = {
3196 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
3197 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
3198 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
3199 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
3200 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
3201 return makeArrayRef(TargetIndices);
3202 }
3203 
3204 /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
3205 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
3206 ScheduleHazardRecognizer *
3207 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
3208 const ScheduleDAG *DAG) const {
3209 return new GCNHazardRecognizer(DAG->MF);
3210 }
3211 
3212 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
3213 /// pass.
3214 ScheduleHazardRecognizer *
3215 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
3216 return new GCNHazardRecognizer(MF);
3217 }
3218