//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//


#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
  : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// \brief Return the number of operands of \p Node, not counting any trailing
/// glue operands.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// \brief Return the chain operand of \p Load, i.e. the last non-glue
/// operand. Asserts that the operand really is a chain (MVT::Other).
static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither instruction has the operand: treat as equal.
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  // Exactly one of the two has the operand: they cannot match.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                   AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    // Non-constant (e.g. register) offsets cannot be compared here.
    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

/// \brief Return true for the DS read2st64/write2st64 opcodes, whose two
/// offsets are in units of 64 elements rather than single elements.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        int64_t &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();

  if (isDS(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive. We
    // will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    // offset0/offset1 are 8-bit fields, so uint8_t holds the full range.
    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        // Loads define one super-register covering both elements, so a single
        // element is half the size of operand 0's register class.
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    // A register soffset makes the effective offset unknown here.
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
                                      MachineInstr *SecondLdSt,
                                      unsigned NumLoads) const {
  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
  }

  if ((isMUBUF(*FirstLdSt) && isMUBUF(*SecondLdSt)) ||
      (isMTBUF(*FirstLdSt) && isMTBUF(*SecondLdSt))) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
  }

  // Mixed categories (or stores without a destination) are never clustered.
  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt->getParent()->getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());

  return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}

void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  // Sub-register index sequences used to decompose wide copies into 32-bit
  // (or, for SGPRs, 64-bit) moves.
  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
  };

  static const int16_t Sub0_7_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
  };

  static const int16_t Sub0_3_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1,
  };

  unsigned Opcode;
  ArrayRef<int16_t> SubIndices;
  bool Forward;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    // 32-bit SGPR copy: a single S_MOV_B32 suffices.
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_3_64;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_7_64;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_15_64;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  // Copy low-to-high when the destination starts at or below the source, so
  // that overlapping register copies do not clobber not-yet-read sources.
  if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
    Forward = true;
  else
    Forward = false;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    // Kill the whole source on the last piece, and mark the whole
    // destination defined on the first, to keep liveness correct for the
    // full-width registers.
    if (Idx == SubIndices.size() - 1)
      Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

/// \brief Map \p MI's opcode to its commuted form (or back), returning -1
/// when the commuted opcode does not exist on this target.
int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // No REV mapping either way: the opcode commutes onto itself.
  return Opcode;
}

/// \brief Return the plain move opcode appropriate for \p DstRC, or
/// AMDGPU::COPY when no single move instruction fits.
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

/// \brief Select the SGPR spill-save pseudo for a register of \p Size bytes.
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief Select the VGPR spill-save pseudo for a register of \p Size bytes.
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo->getObjectSize(FrameIndex);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling
    // SGPRs.
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)            // src
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    // No scratch setup available: emit a diagnostic and a KILL so the
    // register allocator can still make progress.
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg)                         // src
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

/// \brief Select the SGPR spill-restore pseudo for a register of \p Size
/// bytes.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief Select the VGPR spill-restore pseudo for a register of \p Size
/// bytes.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    // No scratch setup available: emit a diagnostic and an IMPLICIT_DEF so
    // the register allocator can still make progress.
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    // Compute the TID once in the entry block and cache it for later spills.
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    // NoRegister here signals failure to the caller.
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;


    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {

      // NOTE(review): the step comments below refer to NGROUPS.X/TIDIG.*,
      // while the code loads the NGROUPS_Z/NGROUPS_Y kernel-input words and
      // reads the WORKGROUP_ID_* preloaded registers — confirm which naming
      // is correct; the comments look stale relative to the code.
      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
        TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
              .addReg(STmp1)
              .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
              .addReg(STmp1)
              .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
              .addReg(STmp0)
              .addReg(TIDIGYReg)
              .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
              .addReg(TIDReg)
              .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addReg(TIDReg);
    }

    // Scale the thread id by 4 (shift left by 2) to get a byte offset.
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
            .addImm(2)
            .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
          .addImm(LDSOffset)
          .addReg(TIDReg);

  return TmpReg;
}

/// \brief Insert \p Count wait states by emitting S_NOP instructions; each
/// S_NOP encodes up to 8 wait states in its immediate (imm + 1 waits).
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
            .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

/// \brief Return the number of wait states an instruction provides.
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    // S_NOP's immediate encodes (waits - 1).
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Expand a 64-bit move into two 32-bit moves of the sub-registers.
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      // Each half-move implicitly defines the full Dst so liveness of the
      // 64-bit register stays intact across the pair.
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Expand a 64-bit conditional select into two 32-bit V_CNDMASKs sharing
    // the same condition operand.
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::SI_CONSTDATA_PTR: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                           .addReg(RegLo)
                           .addOperand(MI->getOperand(1)));
    // Propagate the carry into the high half.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                           .addReg(RegHi)
                           .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
                                                  bool NewMI,
                                                  unsigned OpIdx0,
                                                  unsigned OpIdx1) const {
  int CommutedOpcode = commuteOpcode(*MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI->getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);

  // Only the (src0, src1) pair is commutable here, in either order.
  if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
       OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
      (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
       OpIdx1 != static_cast<unsigned>(Src0Idx)))
    return nullptr;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);


  if (isVOP2(*MI) || isVOPC(*MI)) {
    const MCInstrDesc &InstrDesc = MI->getDesc();
    // For VOP2 and VOPC instructions, any operand type is valid to use for
    // src0.  Make sure we can use the src0 as src1.
    //
    // We could be stricter here and only allow commuting if there is a reason
    // to do so. i.e. if both operands are VGPRs there is no real benefit,
    // although MachineCSE attempts to find matches by commuting.
    const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
    if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
      return nullptr;
  }

  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() ||
        (!isVOP2(*MI) && !isVOP3(*MI))) {
      return nullptr;
    }
    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods
          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods
        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // NOTE(review): if Src1Mods is null and Src0ModsVal == 0 the check
      // above falls through and the getImm() below dereferences a null
      // pointer — confirm whether src0_modifiers without src1_modifiers can
      // actually occur here.
      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    // Both operands are registers; the generic implementation can swap them.
    MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
  }

  if (MI)
    MI->setDesc(get(CommutedOpcode));

  return MI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
1020 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, 1021 unsigned &SrcOpIdx0, 1022 unsigned &SrcOpIdx1) const { 1023 const MCInstrDesc &MCID = MI->getDesc(); 1024 if (!MCID.isCommutable()) 1025 return false; 1026 1027 unsigned Opc = MI->getOpcode(); 1028 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1029 if (Src0Idx == -1) 1030 return false; 1031 1032 // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on 1033 // immediate. Also, immediate src0 operand is not handled in 1034 // SIInstrInfo::commuteInstruction(); 1035 if (!MI->getOperand(Src0Idx).isReg()) 1036 return false; 1037 1038 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1039 if (Src1Idx == -1) 1040 return false; 1041 1042 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1043 if (Src1.isImm()) { 1044 // SIInstrInfo::commuteInstruction() does support commuting the immediate 1045 // operand src1 in 2 and 3 operand instructions. 1046 if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) 1047 return false; 1048 } else if (Src1.isReg()) { 1049 // If any source modifiers are set, the generic instruction commuting won't 1050 // understand how to copy the source modifiers. 
1051 if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || 1052 hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) 1053 return false; 1054 } else 1055 return false; 1056 1057 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1058 } 1059 1060 static void removeModOperands(MachineInstr &MI) { 1061 unsigned Opc = MI.getOpcode(); 1062 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1063 AMDGPU::OpName::src0_modifiers); 1064 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1065 AMDGPU::OpName::src1_modifiers); 1066 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1067 AMDGPU::OpName::src2_modifiers); 1068 1069 MI.RemoveOperand(Src2ModIdx); 1070 MI.RemoveOperand(Src1ModIdx); 1071 MI.RemoveOperand(Src0ModIdx); 1072 } 1073 1074 // TODO: Maybe this should be removed this and custom fold everything in 1075 // SIFoldOperands? 1076 bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, 1077 unsigned Reg, MachineRegisterInfo *MRI) const { 1078 if (!MRI->hasOneNonDBGUse(Reg)) 1079 return false; 1080 1081 unsigned Opc = UseMI->getOpcode(); 1082 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { 1083 // Don't fold if we are using source modifiers. The new VOP2 instructions 1084 // don't have them. 1085 if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || 1086 hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || 1087 hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { 1088 return false; 1089 } 1090 1091 const MachineOperand &ImmOp = DefMI->getOperand(1); 1092 1093 // If this is a free constant, there's no reason to do this. 1094 // TODO: We could fold this here instead of letting SIFoldOperands do it 1095 // later. 
1096 if (isInlineConstant(ImmOp, 4)) 1097 return false; 1098 1099 MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); 1100 MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); 1101 MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); 1102 1103 // Multiplied part is the constant: Use v_madmk_f32 1104 // We should only expect these to be on src0 due to canonicalizations. 1105 if (Src0->isReg() && Src0->getReg() == Reg) { 1106 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1107 return false; 1108 1109 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 1110 return false; 1111 1112 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 1113 1114 const int64_t Imm = DefMI->getOperand(1).getImm(); 1115 1116 // FIXME: This would be a lot easier if we could return a new instruction 1117 // instead of having to modify in place. 1118 1119 // Remove these first since they are at the end. 1120 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1121 AMDGPU::OpName::omod)); 1122 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1123 AMDGPU::OpName::clamp)); 1124 1125 unsigned Src1Reg = Src1->getReg(); 1126 unsigned Src1SubReg = Src1->getSubReg(); 1127 Src0->setReg(Src1Reg); 1128 Src0->setSubReg(Src1SubReg); 1129 Src0->setIsKill(Src1->isKill()); 1130 1131 if (Opc == AMDGPU::V_MAC_F32_e64) { 1132 UseMI->untieRegOperand( 1133 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1134 } 1135 1136 Src1->ChangeToImmediate(Imm); 1137 1138 removeModOperands(*UseMI); 1139 UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); 1140 1141 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1142 if (DeleteDef) 1143 DefMI->eraseFromParent(); 1144 1145 return true; 1146 } 1147 1148 // Added part is the constant: Use v_madak_f32 1149 if (Src2->isReg() && Src2->getReg() == Reg) { 1150 // Not allowed to use constant bus for another operand. 
1151 // We can however allow an inline immediate as src0. 1152 if (!Src0->isImm() && 1153 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1154 return false; 1155 1156 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1157 return false; 1158 1159 const int64_t Imm = DefMI->getOperand(1).getImm(); 1160 1161 // FIXME: This would be a lot easier if we could return a new instruction 1162 // instead of having to modify in place. 1163 1164 // Remove these first since they are at the end. 1165 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1166 AMDGPU::OpName::omod)); 1167 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1168 AMDGPU::OpName::clamp)); 1169 1170 if (Opc == AMDGPU::V_MAC_F32_e64) { 1171 UseMI->untieRegOperand( 1172 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1173 } 1174 1175 // ChangingToImmediate adds Src2 back to the instruction. 1176 Src2->ChangeToImmediate(Imm); 1177 1178 // These come before src2. 1179 removeModOperands(*UseMI); 1180 UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); 1181 1182 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1183 if (DeleteDef) 1184 DefMI->eraseFromParent(); 1185 1186 return true; 1187 } 1188 } 1189 1190 return false; 1191 } 1192 1193 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1194 int WidthB, int OffsetB) { 1195 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1196 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1197 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 1198 return LowOffset + LowWidth <= HighOffset; 1199 } 1200 1201 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, 1202 MachineInstr *MIb) const { 1203 unsigned BaseReg0, BaseReg1; 1204 int64_t Offset0, Offset1; 1205 1206 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && 1207 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { 1208 1209 if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) { 1210 // FIXME: Handle ds_read2 / ds_write2. 
1211 return false; 1212 } 1213 unsigned Width0 = (*MIa->memoperands_begin())->getSize(); 1214 unsigned Width1 = (*MIb->memoperands_begin())->getSize(); 1215 if (BaseReg0 == BaseReg1 && 1216 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1217 return true; 1218 } 1219 } 1220 1221 return false; 1222 } 1223 1224 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, 1225 MachineInstr *MIb, 1226 AliasAnalysis *AA) const { 1227 assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && 1228 "MIa must load from or modify a memory location"); 1229 assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && 1230 "MIb must load from or modify a memory location"); 1231 1232 if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) 1233 return false; 1234 1235 // XXX - Can we relax this between address spaces? 1236 if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) 1237 return false; 1238 1239 // TODO: Should we check the address space from the MachineMemOperand? That 1240 // would allow us to distinguish objects we know don't alias based on the 1241 // underlying address space, even if it was lowered to a different one, 1242 // e.g. private accesses lowered to use MUBUF instructions on a scratch 1243 // buffer. 
1244 if (isDS(*MIa)) { 1245 if (isDS(*MIb)) 1246 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1247 1248 return !isFLAT(*MIb); 1249 } 1250 1251 if (isMUBUF(*MIa) || isMTBUF(*MIa)) { 1252 if (isMUBUF(*MIb) || isMTBUF(*MIb)) 1253 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1254 1255 return !isFLAT(*MIb) && !isSMRD(*MIb); 1256 } 1257 1258 if (isSMRD(*MIa)) { 1259 if (isSMRD(*MIb)) 1260 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1261 1262 return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); 1263 } 1264 1265 if (isFLAT(*MIa)) { 1266 if (isFLAT(*MIb)) 1267 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1268 1269 return false; 1270 } 1271 1272 return false; 1273 } 1274 1275 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 1276 MachineBasicBlock::iterator &MI, 1277 LiveVariables *LV) const { 1278 1279 switch (MI->getOpcode()) { 1280 default: return nullptr; 1281 case AMDGPU::V_MAC_F32_e64: break; 1282 case AMDGPU::V_MAC_F32_e32: { 1283 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1284 if (Src0->isImm() && !isInlineConstant(*Src0, 4)) 1285 return nullptr; 1286 break; 1287 } 1288 } 1289 1290 const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst); 1291 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1292 const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); 1293 const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); 1294 1295 return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) 1296 .addOperand(*Dst) 1297 .addImm(0) // Src0 mods 1298 .addOperand(*Src0) 1299 .addImm(0) // Src1 mods 1300 .addOperand(*Src1) 1301 .addImm(0) // Src mods 1302 .addOperand(*Src2) 1303 .addImm(0) // clamp 1304 .addImm(0); // omod 1305 } 1306 1307 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI, 1308 const MachineBasicBlock *MBB, 1309 const MachineFunction &MF) const { 1310 // Target-independent instructions do not have an 
implicit-use of EXEC, even 1311 // when they operate on VGPRs. Treating EXEC modifications as scheduling 1312 // boundaries prevents incorrect movements of such instructions. 1313 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 1314 if (MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1315 return true; 1316 1317 return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF); 1318 } 1319 1320 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1321 int64_t SVal = Imm.getSExtValue(); 1322 if (SVal >= -16 && SVal <= 64) 1323 return true; 1324 1325 if (Imm.getBitWidth() == 64) { 1326 uint64_t Val = Imm.getZExtValue(); 1327 return (DoubleToBits(0.0) == Val) || 1328 (DoubleToBits(1.0) == Val) || 1329 (DoubleToBits(-1.0) == Val) || 1330 (DoubleToBits(0.5) == Val) || 1331 (DoubleToBits(-0.5) == Val) || 1332 (DoubleToBits(2.0) == Val) || 1333 (DoubleToBits(-2.0) == Val) || 1334 (DoubleToBits(4.0) == Val) || 1335 (DoubleToBits(-4.0) == Val); 1336 } 1337 1338 // The actual type of the operand does not seem to matter as long 1339 // as the bits match one of the inline immediate values. For example: 1340 // 1341 // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, 1342 // so it is a legal inline immediate. 1343 // 1344 // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in 1345 // floating-point, so it is a legal inline immediate. 
1346 uint32_t Val = Imm.getZExtValue(); 1347 1348 return (FloatToBits(0.0f) == Val) || 1349 (FloatToBits(1.0f) == Val) || 1350 (FloatToBits(-1.0f) == Val) || 1351 (FloatToBits(0.5f) == Val) || 1352 (FloatToBits(-0.5f) == Val) || 1353 (FloatToBits(2.0f) == Val) || 1354 (FloatToBits(-2.0f) == Val) || 1355 (FloatToBits(4.0f) == Val) || 1356 (FloatToBits(-4.0f) == Val); 1357 } 1358 1359 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 1360 unsigned OpSize) const { 1361 if (MO.isImm()) { 1362 // MachineOperand provides no way to tell the true operand size, since it 1363 // only records a 64-bit value. We need to know the size to determine if a 1364 // 32-bit floating point immediate bit pattern is legal for an integer 1365 // immediate. It would be for any 32-bit integer operand, but would not be 1366 // for a 64-bit one. 1367 1368 unsigned BitSize = 8 * OpSize; 1369 return isInlineConstant(APInt(BitSize, MO.getImm(), true)); 1370 } 1371 1372 return false; 1373 } 1374 1375 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, 1376 unsigned OpSize) const { 1377 return MO.isImm() && !isInlineConstant(MO, OpSize); 1378 } 1379 1380 static bool compareMachineOp(const MachineOperand &Op0, 1381 const MachineOperand &Op1) { 1382 if (Op0.getType() != Op1.getType()) 1383 return false; 1384 1385 switch (Op0.getType()) { 1386 case MachineOperand::MO_Register: 1387 return Op0.getReg() == Op1.getReg(); 1388 case MachineOperand::MO_Immediate: 1389 return Op0.getImm() == Op1.getImm(); 1390 default: 1391 llvm_unreachable("Didn't expect to be comparing these operand types"); 1392 } 1393 } 1394 1395 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, 1396 const MachineOperand &MO) const { 1397 const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; 1398 1399 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1400 1401 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 1402 return true; 1403 1404 if (OpInfo.RegClass < 0) 1405 return 
false; 1406 1407 unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); 1408 if (isLiteralConstant(MO, OpSize)) 1409 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 1410 1411 return RI.opCanUseInlineConstant(OpInfo.OperandType); 1412 } 1413 1414 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 1415 int Op32 = AMDGPU::getVOPe32(Opcode); 1416 if (Op32 == -1) 1417 return false; 1418 1419 return pseudoToMCOpcode(Op32) != -1; 1420 } 1421 1422 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 1423 // The src0_modifier operand is present on all instructions 1424 // that have modifiers. 1425 1426 return AMDGPU::getNamedOperandIdx(Opcode, 1427 AMDGPU::OpName::src0_modifiers) != -1; 1428 } 1429 1430 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 1431 unsigned OpName) const { 1432 const MachineOperand *Mods = getNamedOperand(MI, OpName); 1433 return Mods && Mods->getImm(); 1434 } 1435 1436 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 1437 const MachineOperand &MO, 1438 unsigned OpSize) const { 1439 // Literal constants use the constant bus. 1440 if (isLiteralConstant(MO, OpSize)) 1441 return true; 1442 1443 if (!MO.isReg() || !MO.isUse()) 1444 return false; 1445 1446 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 1447 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 1448 1449 // FLAT_SCR is just an SGPR pair. 1450 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 1451 return true; 1452 1453 // EXEC register uses the constant bus. 
1454 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 1455 return true; 1456 1457 // SGPRs use the constant bus 1458 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 1459 (!MO.isImplicit() && 1460 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 1461 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 1462 } 1463 1464 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 1465 for (const MachineOperand &MO : MI.implicit_operands()) { 1466 // We only care about reads. 1467 if (MO.isDef()) 1468 continue; 1469 1470 switch (MO.getReg()) { 1471 case AMDGPU::VCC: 1472 case AMDGPU::M0: 1473 case AMDGPU::FLAT_SCR: 1474 return MO.getReg(); 1475 1476 default: 1477 break; 1478 } 1479 } 1480 1481 return AMDGPU::NoRegister; 1482 } 1483 1484 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, 1485 StringRef &ErrInfo) const { 1486 uint16_t Opcode = MI->getOpcode(); 1487 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1488 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 1489 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 1490 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 1491 1492 // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes 1493 // all SCC users are in the same blocks as their defs. 1494 const MachineBasicBlock *MBB = MI->getParent(); 1495 if (MI == &MBB->front()) { 1496 if (MBB->isLiveIn(AMDGPU::SCC)) { 1497 ErrInfo = "scc register cannot be live across blocks."; 1498 return false; 1499 } 1500 } 1501 1502 // Make sure the number of operands is correct. 1503 const MCInstrDesc &Desc = get(Opcode); 1504 if (!Desc.isVariadic() && 1505 Desc.getNumOperands() != MI->getNumExplicitOperands()) { 1506 ErrInfo = "Instruction has wrong number of operands."; 1507 return false; 1508 } 1509 1510 // Make sure the register classes are correct. 
1511 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 1512 if (MI->getOperand(i).isFPImm()) { 1513 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 1514 "all fp values to integers."; 1515 return false; 1516 } 1517 1518 int RegClass = Desc.OpInfo[i].RegClass; 1519 1520 switch (Desc.OpInfo[i].OperandType) { 1521 case MCOI::OPERAND_REGISTER: 1522 if (MI->getOperand(i).isImm()) { 1523 ErrInfo = "Illegal immediate value for operand."; 1524 return false; 1525 } 1526 break; 1527 case AMDGPU::OPERAND_REG_IMM32: 1528 break; 1529 case AMDGPU::OPERAND_REG_INLINE_C: 1530 if (isLiteralConstant(MI->getOperand(i), 1531 RI.getRegClass(RegClass)->getSize())) { 1532 ErrInfo = "Illegal immediate value for operand."; 1533 return false; 1534 } 1535 break; 1536 case MCOI::OPERAND_IMMEDIATE: 1537 // Check if this operand is an immediate. 1538 // FrameIndex operands will be replaced by immediates, so they are 1539 // allowed. 1540 if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { 1541 ErrInfo = "Expected immediate, but got non-immediate"; 1542 return false; 1543 } 1544 // Fall-through 1545 default: 1546 continue; 1547 } 1548 1549 if (!MI->getOperand(i).isReg()) 1550 continue; 1551 1552 if (RegClass != -1) { 1553 unsigned Reg = MI->getOperand(i).getReg(); 1554 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1555 continue; 1556 1557 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 1558 if (!RC->contains(Reg)) { 1559 ErrInfo = "Operand has incorrect register class."; 1560 return false; 1561 } 1562 } 1563 } 1564 1565 1566 // Verify VOP* 1567 if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { 1568 // Only look at the true operands. Only a real operand can use the constant 1569 // bus, and we don't want to check pseudo-operands like the source modifier 1570 // flags. 
1571 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 1572 1573 unsigned ConstantBusCount = 0; 1574 unsigned SGPRUsed = findImplicitSGPRRead(*MI); 1575 if (SGPRUsed != AMDGPU::NoRegister) 1576 ++ConstantBusCount; 1577 1578 for (int OpIdx : OpIndices) { 1579 if (OpIdx == -1) 1580 break; 1581 const MachineOperand &MO = MI->getOperand(OpIdx); 1582 if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { 1583 if (MO.isReg()) { 1584 if (MO.getReg() != SGPRUsed) 1585 ++ConstantBusCount; 1586 SGPRUsed = MO.getReg(); 1587 } else { 1588 ++ConstantBusCount; 1589 } 1590 } 1591 } 1592 if (ConstantBusCount > 1) { 1593 ErrInfo = "VOP* instruction uses the constant bus more than once"; 1594 return false; 1595 } 1596 } 1597 1598 // Verify misc. restrictions on specific instructions. 1599 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 1600 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 1601 const MachineOperand &Src0 = MI->getOperand(Src0Idx); 1602 const MachineOperand &Src1 = MI->getOperand(Src1Idx); 1603 const MachineOperand &Src2 = MI->getOperand(Src2Idx); 1604 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 1605 if (!compareMachineOp(Src0, Src1) && 1606 !compareMachineOp(Src0, Src2)) { 1607 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 1608 return false; 1609 } 1610 } 1611 } 1612 1613 // Make sure we aren't losing exec uses in the td files. This mostly requires 1614 // being careful when using let Uses to try to add other use registers. 
1615 if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { 1616 if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 1617 ErrInfo = "VALU instruction does not implicitly read exec mask"; 1618 return false; 1619 } 1620 } 1621 1622 return true; 1623 } 1624 1625 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 1626 switch (MI.getOpcode()) { 1627 default: return AMDGPU::INSTRUCTION_LIST_END; 1628 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 1629 case AMDGPU::COPY: return AMDGPU::COPY; 1630 case AMDGPU::PHI: return AMDGPU::PHI; 1631 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 1632 case AMDGPU::S_MOV_B32: 1633 return MI.getOperand(1).isReg() ? 1634 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 1635 case AMDGPU::S_ADD_I32: 1636 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 1637 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 1638 case AMDGPU::S_SUB_I32: 1639 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 1640 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 1641 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 1642 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; 1643 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; 1644 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; 1645 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; 1646 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; 1647 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; 1648 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; 1649 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 1650 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 1651 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 1652 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 1653 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 1654 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 1655 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 1656 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 1657 case 
AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 1658 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 1659 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 1660 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 1661 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 1662 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 1663 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 1664 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 1665 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 1666 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 1667 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 1668 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 1669 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 1670 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 1671 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 1672 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 1673 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 1674 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 1675 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 1676 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 1677 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 1678 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 1679 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 1680 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 1681 } 1682 } 1683 1684 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 1685 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 1686 } 1687 1688 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 1689 unsigned OpNo) const { 1690 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 1691 const MCInstrDesc &Desc = get(MI.getOpcode()); 1692 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 1693 
Desc.OpInfo[OpNo].RegClass == -1) { 1694 unsigned Reg = MI.getOperand(OpNo).getReg(); 1695 1696 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1697 return MRI.getRegClass(Reg); 1698 return RI.getPhysRegClass(Reg); 1699 } 1700 1701 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 1702 return RI.getRegClass(RCID); 1703 } 1704 1705 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 1706 switch (MI.getOpcode()) { 1707 case AMDGPU::COPY: 1708 case AMDGPU::REG_SEQUENCE: 1709 case AMDGPU::PHI: 1710 case AMDGPU::INSERT_SUBREG: 1711 return RI.hasVGPRs(getOpRegClass(MI, 0)); 1712 default: 1713 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 1714 } 1715 } 1716 1717 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { 1718 MachineBasicBlock::iterator I = MI; 1719 MachineBasicBlock *MBB = MI->getParent(); 1720 MachineOperand &MO = MI->getOperand(OpIdx); 1721 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1722 unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; 1723 const TargetRegisterClass *RC = RI.getRegClass(RCID); 1724 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1725 if (MO.isReg()) 1726 Opcode = AMDGPU::COPY; 1727 else if (RI.isSGPRClass(RC)) 1728 Opcode = AMDGPU::S_MOV_B32; 1729 1730 1731 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 1732 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 1733 VRC = &AMDGPU::VReg_64RegClass; 1734 else 1735 VRC = &AMDGPU::VGPR_32RegClass; 1736 1737 unsigned Reg = MRI.createVirtualRegister(VRC); 1738 DebugLoc DL = MBB->findDebugLoc(I); 1739 BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) 1740 .addOperand(MO); 1741 MO.ChangeToRegister(Reg, false); 1742 } 1743 1744 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 1745 MachineRegisterInfo &MRI, 1746 MachineOperand &SuperReg, 1747 const TargetRegisterClass *SuperRC, 1748 unsigned SubIdx, 1749 const TargetRegisterClass *SubRC) 1750 const { 1751 MachineBasicBlock *MBB = 
MI->getParent(); 1752 DebugLoc DL = MI->getDebugLoc(); 1753 unsigned SubReg = MRI.createVirtualRegister(SubRC); 1754 1755 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 1756 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1757 .addReg(SuperReg.getReg(), 0, SubIdx); 1758 return SubReg; 1759 } 1760 1761 // Just in case the super register is itself a sub-register, copy it to a new 1762 // value so we don't need to worry about merging its subreg index with the 1763 // SubIdx passed to this function. The register coalescer should be able to 1764 // eliminate this extra copy. 1765 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 1766 1767 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 1768 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 1769 1770 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1771 .addReg(NewSuperReg, 0, SubIdx); 1772 1773 return SubReg; 1774 } 1775 1776 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 1777 MachineBasicBlock::iterator MII, 1778 MachineRegisterInfo &MRI, 1779 MachineOperand &Op, 1780 const TargetRegisterClass *SuperRC, 1781 unsigned SubIdx, 1782 const TargetRegisterClass *SubRC) const { 1783 if (Op.isImm()) { 1784 // XXX - Is there a better way to do this? 
1785 if (SubIdx == AMDGPU::sub0) 1786 return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); 1787 if (SubIdx == AMDGPU::sub1) 1788 return MachineOperand::CreateImm(Op.getImm() >> 32); 1789 1790 llvm_unreachable("Unhandled register index for immediate"); 1791 } 1792 1793 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 1794 SubIdx, SubRC); 1795 return MachineOperand::CreateReg(SubReg, false); 1796 } 1797 1798 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 1799 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { 1800 assert(Inst->getNumExplicitOperands() == 3); 1801 MachineOperand Op1 = Inst->getOperand(1); 1802 Inst->RemoveOperand(1); 1803 Inst->addOperand(Op1); 1804 } 1805 1806 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 1807 const MCOperandInfo &OpInfo, 1808 const MachineOperand &MO) const { 1809 if (!MO.isReg()) 1810 return false; 1811 1812 unsigned Reg = MO.getReg(); 1813 const TargetRegisterClass *RC = 1814 TargetRegisterInfo::isVirtualRegister(Reg) ? 1815 MRI.getRegClass(Reg) : 1816 RI.getPhysRegClass(Reg); 1817 1818 const SIRegisterInfo *TRI = 1819 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1820 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 1821 1822 // In order to be legal, the common sub-class must be equal to the 1823 // class of the current operand. 
// ... For example:
//
// v_mov_b32 s0 ; Operand defined as vsrc_32
//              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
//
// s_sendmsg 0, s0 ; Operand defined as m0reg
//                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

// Returns true if \p MO is usable where the operand descriptor \p OpInfo
// expects a VSrc: registers are checked against the register-class
// constraint; immediates, target indices, and frame indices are always
// accepted.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

// Returns true if operand \p OpIdx of \p MI would be legal if it held \p MO
// (or the current operand at that index when \p MO is null). For VALU
// instructions this also enforces the single-constant-bus-use rule by
// scanning the other operands.
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI->getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  // NOTE(review): DefinedRC is dereferenced here but may be null when
  // OpInfo.RegClass == -1 — presumably VALU operands always carry a register
  // class; confirm.
  if (isVALU(*MI) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {

    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    // Reject the candidate if any *other* operand already consumes the one
    // allowed constant-bus slot (same register counts only once).
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI->getOperand(i);
      if (Op.isReg() &&
          (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Legalize the operands of a VOP2 instruction in place, either by commuting
// src0/src1 (when that alone makes the operands legal) or by inserting a
// move for the offending operand.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI->getOperand(Src0Idx);

    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI->isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI->getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(*MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI->setDesc(get(CommutedOpc));

  // Manually swap src0 and src1 (saving src0's fields first, since
  // ChangeTo* overwrites them).
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
// Legalize the operands of a VOP3 instruction: at most one SGPR source is
// allowed, so every additional SGPR operand is replaced via
// legalizeOpWithMove.
void SIInstrInfo::legalizeOperandsVOP3(
  MachineRegisterInfo &MRI,
  MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI->getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

// Copy a VGPR \p SrcReg into a newly created SGPR virtual register of the
// equivalent class, one 32-bit lane at a time, using v_readfirstlane and a
// REG_SEQUENCE. Returns the new SGPR. Inserted before \p UseMI.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  // One 32-bit subregister per 4 bytes of the source class.
  unsigned SubRegs = VRC->getSize() / 4;

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
            .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
                                    UseMI->getDebugLoc(),
                                    get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

// Legalize an SMRD instruction's sbase operand by read-first-laning it into
// SGPRs when it currently lives in VGPRs.
void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr *MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

// Top-level operand legalization: dispatch to the per-format helpers, then
// handle PHI, REG_SEQUENCE, INSERT_SUBREG, MIMG, and MUBUF specially.
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  // Legalize VOP2
  if (isVOP2(*MI) || isVOPC(*MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(*MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(*MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // Operands of a PHI come in (value, block) pairs, hence the stride of 2.
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    // Copy every PHI input into a fresh register of the common class; the
    // copy is inserted at the end of the corresponding predecessor block.
    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI->getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction.
      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI->getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI->getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
        .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MIMG: srsrc and ssamp must be SGPRs; readfirstlane them if not.
  if (isMIMG(*MI)) {
    MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor: a zeroed base pointer plus the
    // default data format in the upper two dwords.
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
            Zero64)
      .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI->getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      // (V_ADDC consumes the carry produced by the low-half add above.)
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVAddrLo:sub0, NewVAddrHi:sub1}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB
          = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC
            = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE
            = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addOperand(*VDataIn)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset)
          .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
          .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
      }

      MI->removeFromParent();
      MI = Addr64;

      // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1} — the extracted pointer
      // becomes the 64-bit vaddr of the new ADDR64 instruction.
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      // Re-fetch the operands from the freshly built ADDR64 instruction.
      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

// Move a scalar (SALU) instruction and, transitively, everything that uses
// its result to the vector (VALU) unit. Worklist-driven; instructions that
// have no VALU equivalent only get their operands legalized.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SmallVector<MachineInstr *, 128> Worklist;
  Worklist.push_back(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr *Inst = Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst->getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst->getOpcode();
    unsigned NewOpcode = getVALUOp(*Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64: {
      splitScalar64BitBFE(Worklist, Inst);
      Inst->eraseFromParent();
      continue;
    }

    // On VI the VALU shifts take their shift amount as the *first* operand
    // (the _REV forms), so swap the operands when retargeting.
    case AMDGPU::S_LSHL_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
        .addReg(AMDGPU::EXEC)
        .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst->setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst->getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        Inst->RemoveOperand(i);
        addSCCDefUsersToVALUWorklist(Inst, Worklist);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst->addOperand(MachineOperand::CreateImm(0));
      Inst->addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst->addOperand(MachineOperand::CreateImm(0));
    }

    Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst->RemoveOperand(2); // Remove old immediate.
      Inst->addOperand(MachineOperand::CreateImm(Offset));
      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
      if (!NewDstRC)
        continue;

      unsigned DstReg = Inst->getOperand(0).getReg();
      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    legalizeOperands(Inst);

    if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

//===----------------------------------------------------------------------===//
// Indirect addressing callbacks
//===----------------------------------------------------------------------===//

const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
  return &AMDGPU::VGPR_32RegClass;
}

// Expand S_ABS_I32 into VALU instructions: abs(x) = max(x, 0 - x).
void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
                                 MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  // ResultReg = max(Src, TmpReg)
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Split a 64-bit scalar unary op into two 32-bit VALU ops (one per half)
// and recombine the halves with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  // High half.
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

// Split a 64-bit scalar binary op into two 32-bit VALU ops (low halves and
// high halves operated on independently) and recombine with a REG_SEQUENCE.
// Only valid for bitwise ops where the halves are independent.
void SIInstrInfo::splitScalar64BitBinaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  MachineOperand &Src1 = Inst->getOperand(2);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0)
    .addOperand(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  // High half.
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1)
    .addOperand(SrcReg1Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

// Expand S_BCNT1_I32_B64 into two 32-bit V_BCNT ops, using the add operand
// of the second V_BCNT to accumulate the first half's count.
void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                                       MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = popcount(Src:sub0) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg)
    .addOperand(SrcRegSub0)
    .addImm(0);

  // ResultReg = popcount(Src:sub1) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
    .addOperand(SrcRegSub1)
    .addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Expand S_BFE_I64 (sign-extending bitfield extract, offset 0, width <= 32)
// into a 32-bit V_BFE on the low half plus a sign-extension of the high half.
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  uint32_t Imm = Inst->getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
         BitWidth <= 32 &&
         Offset == 0 &&
         "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // MidRegLo = sign-extended bitfield from the low half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    // MidRegHi = MidRegLo >> 31 (replicate the sign bit).
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half passes through; the high half is the sign
  // bit of the low half.
  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Push every user of \p DstReg that cannot accept a VGPR operand onto the
// moveToVALU worklist.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E; ++I) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.push_back(&UseMI);
    }
  }
}

// Push users of the SCC value defined by \p SCCDefInst onto the worklist,
// scanning forward until the next SCC def.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
                                               SmallVectorImpl<MachineInstr *> &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineBasicBlock::iterator I = SCCDefInst,
         E = SCCDefInst->getParent()->end(); I != E; ++I) {

    // Exit if we find another SCC def.
    if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.push_back(I);
  }
}

// Returns the VGPR-equivalent register class for \p Inst's destination, or
// null when the destination is already a VGPR class (no move needed).
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI->getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
2839 // 2840 // If the operand's class is an SGPR, we can never move it. 2841 2842 unsigned SGPRReg = findImplicitSGPRRead(*MI); 2843 if (SGPRReg != AMDGPU::NoRegister) 2844 return SGPRReg; 2845 2846 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2847 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2848 2849 for (unsigned i = 0; i < 3; ++i) { 2850 int Idx = OpIndices[i]; 2851 if (Idx == -1) 2852 break; 2853 2854 const MachineOperand &MO = MI->getOperand(Idx); 2855 if (!MO.isReg()) 2856 continue; 2857 2858 // Is this operand statically required to be an SGPR based on the operand 2859 // constraints? 2860 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 2861 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 2862 if (IsRequiredSGPR) 2863 return MO.getReg(); 2864 2865 // If this could be a VGPR or an SGPR, Check the dynamic register class. 2866 unsigned Reg = MO.getReg(); 2867 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 2868 if (RI.isSGPRClass(RegRC)) 2869 UsedSGPRs[i] = Reg; 2870 } 2871 2872 // We don't have a required SGPR operand, so we have a bit more freedom in 2873 // selecting operands to move. 2874 2875 // Try to select the most used SGPR. If an SGPR is equal to one of the 2876 // others, we choose that. 2877 // 2878 // e.g. 2879 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2880 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2881 2882 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 2883 // prefer those. 
2884 2885 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2886 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2887 SGPRReg = UsedSGPRs[0]; 2888 } 2889 2890 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2891 if (UsedSGPRs[1] == UsedSGPRs[2]) 2892 SGPRReg = UsedSGPRs[1]; 2893 } 2894 2895 return SGPRReg; 2896 } 2897 2898 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2899 const MachineFunction &MF) const { 2900 int End = getIndirectIndexEnd(MF); 2901 int Begin = getIndirectIndexBegin(MF); 2902 2903 if (End == -1) 2904 return; 2905 2906 2907 for (int Index = Begin; Index <= End; ++Index) 2908 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2909 2910 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2911 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2912 2913 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2914 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2915 2916 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2917 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2918 2919 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2920 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2921 2922 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2923 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2924 } 2925 2926 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2927 unsigned OperandName) const { 2928 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2929 if (Idx == -1) 2930 return nullptr; 2931 2932 return &MI.getOperand(Idx); 2933 } 2934 2935 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2936 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2937 if (ST.isAmdHsaOS()) { 2938 RsrcDataFormat |= (1ULL << 56); 2939 2940 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2941 // Set MTYPE = 2 2942 RsrcDataFormat |= (2ULL << 59); 2943 
} 2944 2945 return RsrcDataFormat; 2946 } 2947 2948 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 2949 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 2950 AMDGPU::RSRC_TID_ENABLE | 2951 0xffffffff; // Size; 2952 2953 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 2954 2955 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); 2956 2957 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 2958 // Clear them unless we want a huge stride. 2959 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2960 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 2961 2962 return Rsrc23; 2963 } 2964 2965 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { 2966 unsigned Opc = MI->getOpcode(); 2967 2968 return isSMRD(Opc); 2969 } 2970 2971 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { 2972 unsigned Opc = MI->getOpcode(); 2973 2974 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 2975 } 2976 2977 ArrayRef<std::pair<int, const char *>> 2978 SIInstrInfo::getSerializableTargetIndices() const { 2979 static const std::pair<int, const char *> TargetIndices[] = { 2980 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 2981 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 2982 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 2983 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 2984 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 2985 return makeArrayRef(TargetIndices); 2986 } 2987 2988 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 2989 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 2990 ScheduleHazardRecognizer * 2991 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 2992 const ScheduleDAG *DAG) const { 2993 return new GCNHazardRecognizer(DAG->MF); 2994 } 2995 2996 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 2997 /// pass. 
2998 ScheduleHazardRecognizer * 2999 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 3000 return new GCNHazardRecognizer(MF); 3001 } 3002