//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
/// operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the exec check.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
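    // If either offset is not a plain constant (e.g. it is a frame index that
    // has not been lowered yet), there is no byte offset to compare, so
    // conservatively report the loads as not sharing a base pointer.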
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        int64_t &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();

  if (isDS(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
                                      MachineInstr *SecondLdSt,
                                      unsigned NumLoads) const {
  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
  }

  if ((isMUBUF(*FirstLdSt) && isMUBUF(*SecondLdSt)) ||
      (isMTBUF(*FirstLdSt) && isMTBUF(*SecondLdSt))) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt->getParent()->getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());

  return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI, DebugLoc DL,
                              unsigned DestReg, unsigned SrcReg,
                              bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
  };

  static const int16_t Sub0_7_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
  };

  static const int16_t Sub0_3_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1,
  };

  unsigned Opcode;
  ArrayRef<int16_t> SubIndices;
  bool Forward;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
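        // A VReg_1 "boolean" lives in a 32-bit VGPR, so materialize the copy
        // into VCC by comparing the VGPR against zero instead of moving it.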
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_3_64;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_7_64;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_15_64;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
    Forward = true;
  else
    Forward = false;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    MachineInstrBuilder Builder =
        BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == SubIndices.size() - 1)
      Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode.
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode.
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo->getObjectSize(FrameIndex);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
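    // The pseudo is expanded after register allocation; SGPR spills are
    // typically lowered to lane writes into a reserved VGPR (v_writelane)
    // rather than real memory stores.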
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)            // src
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg)                         // src
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
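    // Mirror of the save path above: the restore pseudo is expanded after
    // register allocation, typically into v_readlane from the spill VGPR.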
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
        TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
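      // Scavenge two independent 32-bit SGPRs, one for each dword of the
      // dispatch info loaded from the kernel argument segment below.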
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id.
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset.
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
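    // Split the 64-bit move into two 32-bit moves of the low and high halves.
    // The implicit use of the full Dst register on each half keeps the 64-bit
    // value tied together for liveness purposes.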
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::SI_CONSTDATA_PTR: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .addOperand(MI->getOperand(1)));
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                       .addReg(RegHi)
                       .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for a
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
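///
/// On SI this override also switches to the corresponding REV opcode when one
/// exists and moves the source modifiers together with their operands, which
/// the generic TargetInstrInfo implementation does not know how to do.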
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
                                                  bool NewMI,
                                                  unsigned OpIdx0,
                                                  unsigned OpIdx1) const {
  int CommutedOpcode = commuteOpcode(*MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI->getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);

  if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
       OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
      (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
       OpIdx1 != static_cast<unsigned>(Src0Idx)))
    return nullptr;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  if (isVOP2(*MI) || isVOPC(*MI)) {
    const MCInstrDesc &InstrDesc = MI->getDesc();
    // For VOP2 and VOPC instructions, any operand type is valid to use for
    // src0. Make sure we can use the src0 as src1.
    //
    // We could be stricter here and only allow commuting if there is a reason
    // to do so. i.e. if both operands are VGPRs there is no real benefit,
    // although MachineCSE attempts to find matches by commuting.
    const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
    if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
      return nullptr;
  }

  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() ||
        (!isVOP2(*MI) && !isVOP3(*MI))) {
      return nullptr;
    }
    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods
          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods
        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
  }

  if (MI)
    MI->setDesc(get(CommutedOpcode));

  return MI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  const MCInstrDesc &MCID = MI->getDesc();
  if (!MCID.isCommutable())
    return false;

  unsigned Opc = MI->getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
  // immediate. Also, an immediate src0 operand is not handled in
  // SIInstrInfo::commuteInstruction().
  if (!MI->getOperand(Src0Idx).isReg())
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);
  if (Src1.isImm()) {
    // SIInstrInfo::commuteInstruction() does support commuting the immediate
    // operand src1 in 2 and 3 operand instructions.
    if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
      return false;
  } else if (Src1.isReg()) {
    // If any source modifiers are set, the generic instruction commuting won't
    // understand how to copy the source modifiers.
    if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
      return false;
  } else
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

// TODO: Maybe this should be removed and everything custom folded in
// SIFoldOperands instead?
bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    const MachineOperand &ImmOp = DefMI->getOperand(1);

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    if (isInlineConstant(ImmOp, 4))
      return false;

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
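      // Removing omod and clamp from the back first keeps the indices of the
      // earlier operands (src0/src1/src2) stable while they are rewritten.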
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      Src1->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangeToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(*MIa)) {
    if (isDS(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(*MIb);
  }

  if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
    if (isMUBUF(*MIb) || isMTBUF(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(*MIb) && !isSMRD(*MIb);
  }

  if (isSMRD(*MIa)) {
    if (isSMRD(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(*MIb) && !isMUBUF(*MIb) && !isMTBUF(*MIb);
  }

  if (isFLAT(*MIa)) {
    if (isFLAT(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineBasicBlock::iterator &MI,
                                                 LiveVariables *LV) const {

  switch (MI->getOpcode()) {
  default: return nullptr;
  case AMDGPU::V_MAC_F32_e64: break;
  case AMDGPU::V_MAC_F32_e32: {
    const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
  const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
  const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);

  return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
    .addOperand(*Dst)
    .addImm(0) // Src0 mods
    .addOperand(*Src0)
    .addImm(0) // Src1 mods
    .addOperand(*Src1)
    .addImm(0) // Src2 mods
    .addOperand(*Src2)
    .addImm(0)  // clamp
    .addImm(0); // omod
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return true;

  return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.
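    // For example, 0x3f800000 (1.0f) is an inline constant as a 32-bit
    // operand, but as a 64-bit immediate it matches neither the small-integer
    // range nor any of the double-precision patterns above.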

    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions
  // that have modifiers.
  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus.
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes
  // all SCC users are in the same blocks as their defs.
  const MachineBasicBlock *MBB = MI->getParent();
  if (MI == &MBB->front()) {
    if (MBB->isLiveIn(AMDGPU::SCC)) {
      ErrInfo = "scc register cannot be live across blocks.";
      return false;
    }
  }

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*.
  if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
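    // The hardware allows at most one SGPR (or M0/VCC/literal-constant) read
    // over the constant bus per VOP instruction, so count distinct users.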
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = findImplicitSGPRRead(*MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
    if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  return true;
}

unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

Desc.OpInfo[OpNo].RegClass; 1682 return RI.getRegClass(RCID); 1683 } 1684 1685 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 1686 switch (MI.getOpcode()) { 1687 case AMDGPU::COPY: 1688 case AMDGPU::REG_SEQUENCE: 1689 case AMDGPU::PHI: 1690 case AMDGPU::INSERT_SUBREG: 1691 return RI.hasVGPRs(getOpRegClass(MI, 0)); 1692 default: 1693 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 1694 } 1695 } 1696 1697 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { 1698 MachineBasicBlock::iterator I = MI; 1699 MachineBasicBlock *MBB = MI->getParent(); 1700 MachineOperand &MO = MI->getOperand(OpIdx); 1701 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1702 unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; 1703 const TargetRegisterClass *RC = RI.getRegClass(RCID); 1704 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1705 if (MO.isReg()) 1706 Opcode = AMDGPU::COPY; 1707 else if (RI.isSGPRClass(RC)) 1708 Opcode = AMDGPU::S_MOV_B32; 1709 1710 1711 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 1712 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 1713 VRC = &AMDGPU::VReg_64RegClass; 1714 else 1715 VRC = &AMDGPU::VGPR_32RegClass; 1716 1717 unsigned Reg = MRI.createVirtualRegister(VRC); 1718 DebugLoc DL = MBB->findDebugLoc(I); 1719 BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) 1720 .addOperand(MO); 1721 MO.ChangeToRegister(Reg, false); 1722 } 1723 1724 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 1725 MachineRegisterInfo &MRI, 1726 MachineOperand &SuperReg, 1727 const TargetRegisterClass *SuperRC, 1728 unsigned SubIdx, 1729 const TargetRegisterClass *SubRC) 1730 const { 1731 MachineBasicBlock *MBB = MI->getParent(); 1732 DebugLoc DL = MI->getDebugLoc(); 1733 unsigned SubReg = MRI.createVirtualRegister(SubRC); 1734 1735 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 1736 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1737 .addReg(SuperReg.getReg(), 0, SubIdx); 1738 return SubReg; 1739 } 1740 1741 // Just in case the super register is itself a sub-register, copy it to a new 1742 // value so we don't need to worry about merging its subreg index with the 1743 // SubIdx passed to this function. The register coalescer should be able to 1744 // eliminate this extra copy. 1745 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 1746 1747 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 1748 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 1749 1750 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1751 .addReg(NewSuperReg, 0, SubIdx); 1752 1753 return SubReg; 1754 } 1755 1756 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 1757 MachineBasicBlock::iterator MII, 1758 MachineRegisterInfo &MRI, 1759 MachineOperand &Op, 1760 const TargetRegisterClass *SuperRC, 1761 unsigned SubIdx, 1762 const TargetRegisterClass *SubRC) const { 1763 if (Op.isImm()) { 1764 // XXX - Is there a better way to do this? 
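// For example, a 64-bit immediate 0x0000000100000002 yields
// 0x00000002 for sub0 (the low 32 bits) and 0x00000001 for sub1 (the high
// 32 bits), matching how a 64-bit register pair is decomposed.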
1765 if (SubIdx == AMDGPU::sub0) 1766 return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); 1767 if (SubIdx == AMDGPU::sub1) 1768 return MachineOperand::CreateImm(Op.getImm() >> 32); 1769 1770 llvm_unreachable("Unhandled register index for immediate"); 1771 } 1772 1773 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 1774 SubIdx, SubRC); 1775 return MachineOperand::CreateReg(SubReg, false); 1776 } 1777 1778 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 1779 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { 1780 assert(Inst->getNumExplicitOperands() == 3); 1781 MachineOperand Op1 = Inst->getOperand(1); 1782 Inst->RemoveOperand(1); 1783 Inst->addOperand(Op1); 1784 } 1785 1786 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 1787 const MCOperandInfo &OpInfo, 1788 const MachineOperand &MO) const { 1789 if (!MO.isReg()) 1790 return false; 1791 1792 unsigned Reg = MO.getReg(); 1793 const TargetRegisterClass *RC = 1794 TargetRegisterInfo::isVirtualRegister(Reg) ? 1795 MRI.getRegClass(Reg) : 1796 RI.getPhysRegClass(Reg); 1797 1798 const SIRegisterInfo *TRI = 1799 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1800 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 1801 1802 // In order to be legal, the common sub-class must be equal to the 1803 // class of the current operand. For example: 1804 // 1805 // v_mov_b32 s0 ; Operand defined as vsrc_32 1806 // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL 1807 // 1808 // s_sendmsg 0, s0 ; Operand defined as m0reg 1809 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 1810 1811 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 1812 } 1813 1814 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 1815 const MCOperandInfo &OpInfo, 1816 const MachineOperand &MO) const { 1817 if (MO.isReg()) 1818 return isLegalRegOperand(MRI, OpInfo, MO); 1819 1820 // Handle non-register types that are treated like immediates. 1821 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1822 return true; 1823 } 1824 1825 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, 1826 const MachineOperand *MO) const { 1827 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1828 const MCInstrDesc &InstDesc = MI->getDesc(); 1829 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 1830 const TargetRegisterClass *DefinedRC = 1831 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 1832 if (!MO) 1833 MO = &MI->getOperand(OpIdx); 1834 1835 if (isVALU(*MI) && 1836 usesConstantBus(MRI, *MO, DefinedRC->getSize())) { 1837 1838 RegSubRegPair SGPRUsed; 1839 if (MO->isReg()) 1840 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 1841 1842 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 1843 if (i == OpIdx) 1844 continue; 1845 const MachineOperand &Op = MI->getOperand(i); 1846 if (Op.isReg() && 1847 (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 1848 usesConstantBus(MRI, Op, getOpSize(*MI, i))) { 1849 return false; 1850 } 1851 } 1852 } 1853 1854 if (MO->isReg()) { 1855 assert(DefinedRC); 1856 return isLegalRegOperand(MRI, OpInfo, *MO); 1857 } 1858 1859 1860 // Handle non-register types that are treated like immediates. 1861 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 1862 1863 if (!DefinedRC) { 1864 // This operand expects an immediate. 
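// (A RegClass of -1 in the operand descriptor means the operand never takes
// a register, so any immediate-like value is acceptable here.)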
1865 return true; 1866 } 1867 1868 return isImmOperandLegal(MI, OpIdx, *MO); 1869 } 1870 1871 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 1872 MachineInstr *MI) const { 1873 unsigned Opc = MI->getOpcode(); 1874 const MCInstrDesc &InstrDesc = get(Opc); 1875 1876 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1877 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1878 1879 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 1880 // we need to only have one constant bus use. 1881 // 1882 // Note we do not need to worry about literal constants here. They are 1883 // disabled for the operand type for instructions because they will always 1884 // violate the one constant bus use rule. 1885 bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; 1886 if (HasImplicitSGPR) { 1887 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1888 MachineOperand &Src0 = MI->getOperand(Src0Idx); 1889 1890 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 1891 legalizeOpWithMove(MI, Src0Idx); 1892 } 1893 1894 // VOP2 src0 instructions support all operand types, so we don't need to check 1895 // their legality. If src1 is already legal, we don't need to do anything. 1896 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 1897 return; 1898 1899 // We do not use commuteInstruction here because it is too aggressive and will 1900 // commute if it is possible. We only want to commute here if it improves 1901 // legality. This can be called a fairly large number of times so don't waste 1902 // compile time pointlessly swapping and checking legality again. 1903 if (HasImplicitSGPR || !MI->isCommutable()) { 1904 legalizeOpWithMove(MI, Src1Idx); 1905 return; 1906 } 1907 1908 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1909 MachineOperand &Src0 = MI->getOperand(Src0Idx); 1910 1911 // If src0 can be used as src1, commuting will make the operands legal. 1912 // Otherwise we have to give up and insert a move. 1913 // 1914 // TODO: Other immediate-like operand kinds could be commuted if there was a 1915 // MachineOperand::ChangeTo* for them. 1916 if ((!Src1.isImm() && !Src1.isReg()) || 1917 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 1918 legalizeOpWithMove(MI, Src1Idx); 1919 return; 1920 } 1921 1922 int CommutedOpc = commuteOpcode(*MI); 1923 if (CommutedOpc == -1) { 1924 legalizeOpWithMove(MI, Src1Idx); 1925 return; 1926 } 1927 1928 MI->setDesc(get(CommutedOpc)); 1929 1930 unsigned Src0Reg = Src0.getReg(); 1931 unsigned Src0SubReg = Src0.getSubReg(); 1932 bool Src0Kill = Src0.isKill(); 1933 1934 if (Src1.isImm()) 1935 Src0.ChangeToImmediate(Src1.getImm()); 1936 else if (Src1.isReg()) { 1937 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 1938 Src0.setSubReg(Src1.getSubReg()); 1939 } else 1940 llvm_unreachable("Should only have register or immediate operands"); 1941 1942 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 1943 Src1.setSubReg(Src0SubReg); 1944 } 1945 1946 // Legalize VOP3 operands. Because all operand types are supported for any 1947 // operand, and since literal constants are not allowed and should never be 1948 // seen, we only need to worry about inserting copies if we use multiple SGPR 1949 // operands. 
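// For example, a VOP3 instruction such as V_FMA_F32 with two different SGPR
// sources would read the constant bus twice, so one of the sources must be
// copied to a VGPR by legalizeOpWithMove below.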
1950 void SIInstrInfo::legalizeOperandsVOP3(
1951 MachineRegisterInfo &MRI,
1952 MachineInstr *MI) const {
1953 unsigned Opc = MI->getOpcode();
1954
1955 int VOP3Idx[3] = {
1956 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
1957 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
1958 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
1959 };
1960
1961 // Find the one SGPR operand we are allowed to use.
1962 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
1963
1964 for (unsigned i = 0; i < 3; ++i) {
1965 int Idx = VOP3Idx[i];
1966 if (Idx == -1)
1967 break;
1968 MachineOperand &MO = MI->getOperand(Idx);
1969
1970 // We should never see a VOP3 instruction with an illegal immediate operand.
1971 if (!MO.isReg())
1972 continue;
1973
1974 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
1975 continue; // VGPRs are legal
1976
1977 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
1978 SGPRReg = MO.getReg();
1979 // We can use one SGPR in each VOP3 instruction.
1980 continue;
1981 }
1982
1983 // If we make it this far, then the operand is not legal and we must
1984 // legalize it.
1985 legalizeOpWithMove(MI, Idx);
1986 }
1987 }
1988
1989 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
1990 MachineRegisterInfo &MRI) const {
1991 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
1992 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
1993 unsigned DstReg = MRI.createVirtualRegister(SRC);
1994 unsigned SubRegs = VRC->getSize() / 4;
1995
1996 SmallVector<unsigned, 8> SRegs;
1997 for (unsigned i = 0; i < SubRegs; ++i) {
1998 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1999 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2000 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2001 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2002 SRegs.push_back(SGPR);
2003 }
2004
2005 MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
2006 UseMI->getDebugLoc(),
2007 get(AMDGPU::REG_SEQUENCE), DstReg);
2008 for (unsigned i = 0; i < SubRegs; ++i) {
2009 MIB.addReg(SRegs[i]);
2010 MIB.addImm(RI.getSubRegFromChannel(i));
2011 }
2012 return DstReg;
2013 }
2014
2015 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2016 MachineInstr *MI) const {
2017
2018 // If the pointer is stored in VGPRs, then we need to move it to
2019 // SGPRs using v_readfirstlane. This is safe because we only select
2020 // loads with uniform pointers to SMRD instructions, so we know the
2021 // pointer value is uniform.
2022 MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
2023 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2024 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2025 SBase->setReg(SGPR);
2026 }
2027 }
2028
2029 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
2030 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2031
2032 // Legalize VOP2
2033 if (isVOP2(*MI) || isVOPC(*MI)) {
2034 legalizeOperandsVOP2(MRI, MI);
2035 return;
2036 }
2037
2038 // Legalize VOP3
2039 if (isVOP3(*MI)) {
2040 legalizeOperandsVOP3(MRI, MI);
2041 return;
2042 }
2043
2044 // Legalize SMRD
2045 if (isSMRD(*MI)) {
2046 legalizeOperandsSMRD(MRI, MI);
2047 return;
2048 }
2049
2050 // Legalize REG_SEQUENCE and PHI
2051 // The register class of the operands must match the register
2052 // class of the output.
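// For example, a PHI merging an SGPR_32 input with a VGPR_32 input must have
// all of its inputs (and its result) moved to VGPR_32, because VGPR -> SGPR
// copies are illegal while SGPR -> VGPR copies are fine.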
2053 if (MI->getOpcode() == AMDGPU::PHI) {
2054 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2055 for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
2056 if (!MI->getOperand(i).isReg() ||
2057 !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
2058 continue;
2059 const TargetRegisterClass *OpRC =
2060 MRI.getRegClass(MI->getOperand(i).getReg());
2061 if (RI.hasVGPRs(OpRC)) {
2062 VRC = OpRC;
2063 } else {
2064 SRC = OpRC;
2065 }
2066 }
2067
2068 // If any of the operands are VGPR registers, then they all must be
2069 // VGPRs; otherwise we will create illegal VGPR->SGPR copies when
2070 // legalizing them.
2071 if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
2072 if (!VRC) {
2073 assert(SRC);
2074 VRC = RI.getEquivalentVGPRClass(SRC);
2075 }
2076 RC = VRC;
2077 } else {
2078 RC = SRC;
2079 }
2080
2081 // Update all the operands so they have the same type.
2082 for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
2083 MachineOperand &Op = MI->getOperand(I);
2084 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2085 continue;
2086 unsigned DstReg = MRI.createVirtualRegister(RC);
2087
2088 // MI is a PHI instruction.
2089 MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
2090 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2091
2092 BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
2093 .addOperand(Op);
2094 Op.setReg(DstReg);
2095 }
2096 }
2097
2098 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2099 // VGPR dest type and SGPR sources, insert copies so all operands are
2100 // VGPRs. This seems to help operand folding / the register coalescer.
2101 if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
2102 MachineBasicBlock *MBB = MI->getParent();
2103 const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
2104 if (RI.hasVGPRs(DstRC)) {
2105 // Update all the operands so they are VGPR register classes. These may
2106 // not be the same register class because REG_SEQUENCE supports mixing
2107 // subregister index types e.g.
sub0_sub1 + sub2 + sub3
2108 for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
2109 MachineOperand &Op = MI->getOperand(I);
2110 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2111 continue;
2112
2113 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2114 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2115 if (VRC == OpRC)
2116 continue;
2117
2118 unsigned DstReg = MRI.createVirtualRegister(VRC);
2119
2120 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
2121 .addOperand(Op);
2122
2123 Op.setReg(DstReg);
2124 Op.setIsKill();
2125 }
2126 }
2127
2128 return;
2129 }
2130
2131 // Legalize INSERT_SUBREG
2132 // src0 must have the same register class as dst
2133 if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
2134 unsigned Dst = MI->getOperand(0).getReg();
2135 unsigned Src0 = MI->getOperand(1).getReg();
2136 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2137 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2138 if (DstRC != Src0RC) {
2139 MachineBasicBlock &MBB = *MI->getParent();
2140 unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
2141 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
2142 .addReg(Src0);
2143 MI->getOperand(1).setReg(NewSrc0);
2144 }
2145 return;
2146 }
2147
2148 // Legalize MIMG
2149 if (isMIMG(*MI)) {
2150 MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
2151 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2152 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2153 SRsrc->setReg(SGPR);
2154 }
2155
2156 MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
2157 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2158 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2159 SSamp->setReg(SGPR);
2160 }
2161 return;
2162 }
2163
2164 // Legalize MUBUF* instructions
2165 // FIXME: If we start using the non-addr64 instructions for compute, we
2166 // may need to legalize them here.
2167 int SRsrcIdx =
2168 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
2169 if (SRsrcIdx != -1) {
2170 // We have an MUBUF instruction
2171 MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
2172 unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
2173 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2174 RI.getRegClass(SRsrcRC))) {
2175 // The operands are legal.
2176 // FIXME: We may need to legalize operands besides srsrc.
2177 return;
2178 }
2179
2180 MachineBasicBlock &MBB = *MI->getParent();
2181
2182 // Extract the ptr from the resource descriptor.
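// The 128-bit descriptor keeps its base pointer in sub0_sub1. That base is
// extracted here and added into vaddr below, while a replacement descriptor
// is built with a zeroed base (Zero64) and the default data format words in
// sub2/sub3.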
2183 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2184 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2185
2186 // Create an empty resource descriptor
2187 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2188 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2189 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2190 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2191 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2192
2193 // Zero64 = 0
2194 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
2195 Zero64)
2196 .addImm(0);
2197
2198 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2199 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
2200 SRsrcFormatLo)
2201 .addImm(RsrcDataFormat & 0xFFFFFFFF);
2202
2203 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2204 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
2205 SRsrcFormatHi)
2206 .addImm(RsrcDataFormat >> 32);
2207
2208 // NewSRsrc = {Zero64, SRsrcFormat}
2209 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2210 .addReg(Zero64)
2211 .addImm(AMDGPU::sub0_sub1)
2212 .addReg(SRsrcFormatLo)
2213 .addImm(AMDGPU::sub2)
2214 .addReg(SRsrcFormatHi)
2215 .addImm(AMDGPU::sub3);
2216
2217 MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
2218 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2219 if (VAddr) {
2220 // This is already an ADDR64 instruction so we need to add the pointer
2221 // extracted from the resource descriptor to the current value of VAddr.
2222 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2223 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2224
2225 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2226 DebugLoc DL = MI->getDebugLoc();
2227 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2228 .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2229 .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2230
2231 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2232 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2233 .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2234 .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2235
2236 // NewVaddr = {NewVaddrHi, NewVaddrLo}
2237 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2238 .addReg(NewVAddrLo)
2239 .addImm(AMDGPU::sub0)
2240 .addReg(NewVAddrHi)
2241 .addImm(AMDGPU::sub1);
2242 } else {
2243 // This instruction is the _OFFSET variant, so we need to convert it to
2244 // ADDR64.
2245 assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
2246 < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
2247 "FIXME: Need to emit flat atomics here");
2248
2249 MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
2250 MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
2251 MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
2252 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
2253
2254 // Atomics with return have an additional tied operand and are
2255 // missing some of the special bits.
2256 MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
2257 MachineInstr *Addr64;
2258
2259 if (!VDataIn) {
2260 // Regular buffer load / store.
2261 MachineInstrBuilder MIB
2262 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
2263 .addOperand(*VData)
2264 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2265 // This will be replaced later 2266 // with the new value of vaddr. 2267 .addOperand(*SRsrc) 2268 .addOperand(*SOffset) 2269 .addOperand(*Offset); 2270 2271 // Atomics do not have this operand. 2272 if (const MachineOperand *GLC 2273 = getNamedOperand(*MI, AMDGPU::OpName::glc)) { 2274 MIB.addImm(GLC->getImm()); 2275 } 2276 2277 MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); 2278 2279 if (const MachineOperand *TFE 2280 = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { 2281 MIB.addImm(TFE->getImm()); 2282 } 2283 2284 MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 2285 Addr64 = MIB; 2286 } else { 2287 // Atomics with return. 2288 Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) 2289 .addOperand(*VData) 2290 .addOperand(*VDataIn) 2291 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 2292 // This will be replaced later 2293 // with the new value of vaddr. 2294 .addOperand(*SRsrc) 2295 .addOperand(*SOffset) 2296 .addOperand(*Offset) 2297 .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) 2298 .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 2299 } 2300 2301 MI->removeFromParent(); 2302 MI = Addr64; 2303 2304 // NewVaddr = {NewVaddrHi, NewVaddrLo} 2305 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 2306 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 2307 .addImm(AMDGPU::sub0) 2308 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 2309 .addImm(AMDGPU::sub1); 2310 2311 VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); 2312 SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); 2313 } 2314 2315 // Update the instruction to use NewVaddr 2316 VAddr->setReg(NewVAddr); 2317 // Update the instruction to use NewSRsrc 2318 SRsrc->setReg(NewSRsrc); 2319 } 2320 } 2321 2322 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2323 SmallVector<MachineInstr *, 128> Worklist; 2324 Worklist.push_back(&TopInst); 2325 2326 while (!Worklist.empty()) { 2327 MachineInstr *Inst = Worklist.pop_back_val(); 2328 MachineBasicBlock *MBB = Inst->getParent(); 2329 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2330 2331 unsigned Opcode = Inst->getOpcode(); 2332 unsigned NewOpcode = getVALUOp(*Inst); 2333 2334 // Handle some special cases 2335 switch (Opcode) { 2336 default: 2337 break; 2338 case AMDGPU::S_AND_B64: 2339 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 2340 Inst->eraseFromParent(); 2341 continue; 2342 2343 case AMDGPU::S_OR_B64: 2344 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 2345 Inst->eraseFromParent(); 2346 continue; 2347 2348 case AMDGPU::S_XOR_B64: 2349 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 2350 Inst->eraseFromParent(); 2351 continue; 2352 2353 case AMDGPU::S_NOT_B64: 2354 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 2355 Inst->eraseFromParent(); 2356 continue; 2357 2358 case AMDGPU::S_BCNT1_I32_B64: 2359 splitScalar64BitBCNT(Worklist, Inst); 2360 Inst->eraseFromParent(); 2361 continue; 2362 2363 case AMDGPU::S_BFE_I64: { 2364 splitScalar64BitBFE(Worklist, Inst); 2365 Inst->eraseFromParent(); 2366 continue; 2367 } 2368 2369 case AMDGPU::S_LSHL_B32: 2370 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2371 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 2372 swapOperands(Inst); 2373 } 2374 break; 2375 case AMDGPU::S_ASHR_I32: 2376 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2377 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 2378 swapOperands(Inst); 2379 } 2380 break; 2381 case AMDGPU::S_LSHR_B32: 2382 if (ST.getGeneration() 
>= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2383 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
2384 swapOperands(Inst);
2385 }
2386 break;
2387 case AMDGPU::S_LSHL_B64:
2388 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2389 NewOpcode = AMDGPU::V_LSHLREV_B64;
2390 swapOperands(Inst);
2391 }
2392 break;
2393 case AMDGPU::S_ASHR_I64:
2394 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2395 NewOpcode = AMDGPU::V_ASHRREV_I64;
2396 swapOperands(Inst);
2397 }
2398 break;
2399 case AMDGPU::S_LSHR_B64:
2400 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2401 NewOpcode = AMDGPU::V_LSHRREV_B64;
2402 swapOperands(Inst);
2403 }
2404 break;
2405
2406 case AMDGPU::S_ABS_I32:
2407 lowerScalarAbs(Worklist, Inst);
2408 Inst->eraseFromParent();
2409 continue;
2410
2411 case AMDGPU::S_CBRANCH_SCC0:
2412 case AMDGPU::S_CBRANCH_SCC1:
2413 // Clear unused bits of vcc
2414 BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
2415 .addReg(AMDGPU::EXEC)
2416 .addReg(AMDGPU::VCC);
2417 break;
2418
2419 case AMDGPU::S_BFE_U64:
2420 case AMDGPU::S_BFM_B64:
2421 llvm_unreachable("Moving this op to VALU not implemented");
2422 }
2423
2424 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
2425 // We cannot move this instruction to the VALU, so we should try to
2426 // legalize its operands instead.
2427 legalizeOperands(Inst);
2428 continue;
2429 }
2430
2431 // Use the new VALU Opcode.
2432 const MCInstrDesc &NewDesc = get(NewOpcode);
2433 Inst->setDesc(NewDesc);
2434
2435 // Remove any references to SCC. Vector instructions can't read from it, and
2436 // we're just about to add the implicit use / defs of VCC, and we don't want
2437 // both.
2438 for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
2439 MachineOperand &Op = Inst->getOperand(i);
2440 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
2441 Inst->RemoveOperand(i);
2442 addSCCDefUsersToVALUWorklist(Inst, Worklist);
2443 }
2444 }
2445
2446 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
2447 // We are converting these to a BFE, so we need to add the missing
2448 // operands for the size and offset.
2449 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
2450 Inst->addOperand(MachineOperand::CreateImm(0));
2451 Inst->addOperand(MachineOperand::CreateImm(Size));
2452
2453 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
2454 // The VALU version adds the second operand to the result, so insert an
2455 // extra 0 operand.
2456 Inst->addOperand(MachineOperand::CreateImm(0));
2457 }
2458
2459 Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
2460
2461 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
2462 const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
2463 // If we need to move this to VGPRs, we need to unpack the second operand
2464 // back into the 2 separate ones for bit offset and width.
2465 assert(OffsetWidthOp.isImm() &&
2466 "Scalar BFE is only implemented for constant width and offset");
2467 uint32_t Imm = OffsetWidthOp.getImm();
2468
2469 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2470 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2471 Inst->RemoveOperand(2); // Remove old immediate.
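// For example, a packed immediate of 0x00080006 unpacks to Offset = 6 and
// BitWidth = 8; these are re-added as two separate immediate operands below.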
2472 Inst->addOperand(MachineOperand::CreateImm(Offset)); 2473 Inst->addOperand(MachineOperand::CreateImm(BitWidth)); 2474 } 2475 2476 bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef(); 2477 unsigned NewDstReg = AMDGPU::NoRegister; 2478 if (HasDst) { 2479 // Update the destination register class. 2480 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); 2481 if (!NewDstRC) 2482 continue; 2483 2484 unsigned DstReg = Inst->getOperand(0).getReg(); 2485 NewDstReg = MRI.createVirtualRegister(NewDstRC); 2486 MRI.replaceRegWith(DstReg, NewDstReg); 2487 } 2488 2489 // Legalize the operands 2490 legalizeOperands(Inst); 2491 2492 if (HasDst) 2493 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 2494 } 2495 } 2496 2497 //===----------------------------------------------------------------------===// 2498 // Indirect addressing callbacks 2499 //===----------------------------------------------------------------------===// 2500 2501 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { 2502 return &AMDGPU::VGPR_32RegClass; 2503 } 2504 2505 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 2506 MachineInstr *Inst) const { 2507 MachineBasicBlock &MBB = *Inst->getParent(); 2508 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2509 MachineBasicBlock::iterator MII = Inst; 2510 DebugLoc DL = Inst->getDebugLoc(); 2511 2512 MachineOperand &Dest = Inst->getOperand(0); 2513 MachineOperand &Src = Inst->getOperand(1); 2514 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2515 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2516 2517 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 2518 .addImm(0) 2519 .addReg(Src.getReg()); 2520 2521 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 2522 .addReg(Src.getReg()) 2523 .addReg(TmpReg); 2524 2525 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2526 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2527 } 2528 2529 void SIInstrInfo::splitScalar64BitUnaryOp( 2530 SmallVectorImpl<MachineInstr *> &Worklist, 2531 MachineInstr *Inst, 2532 unsigned Opcode) const { 2533 MachineBasicBlock &MBB = *Inst->getParent(); 2534 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2535 2536 MachineOperand &Dest = Inst->getOperand(0); 2537 MachineOperand &Src0 = Inst->getOperand(1); 2538 DebugLoc DL = Inst->getDebugLoc(); 2539 2540 MachineBasicBlock::iterator MII = Inst; 2541 2542 const MCInstrDesc &InstDesc = get(Opcode); 2543 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
2544 MRI.getRegClass(Src0.getReg()) : 2545 &AMDGPU::SGPR_32RegClass; 2546 2547 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2548 2549 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2550 AMDGPU::sub0, Src0SubRC); 2551 2552 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2553 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 2554 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 2555 2556 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 2557 BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2558 .addOperand(SrcReg0Sub0); 2559 2560 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2561 AMDGPU::sub1, Src0SubRC); 2562 2563 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 2564 BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2565 .addOperand(SrcReg0Sub1); 2566 2567 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 2568 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2569 .addReg(DestSub0) 2570 .addImm(AMDGPU::sub0) 2571 .addReg(DestSub1) 2572 .addImm(AMDGPU::sub1); 2573 2574 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2575 2576 // We don't need to legalizeOperands here because for a single operand, src0 2577 // will support any kind of input. 2578 2579 // Move all users of this moved value. 2580 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 2581 } 2582 2583 void SIInstrInfo::splitScalar64BitBinaryOp( 2584 SmallVectorImpl<MachineInstr *> &Worklist, 2585 MachineInstr *Inst, 2586 unsigned Opcode) const { 2587 MachineBasicBlock &MBB = *Inst->getParent(); 2588 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2589 2590 MachineOperand &Dest = Inst->getOperand(0); 2591 MachineOperand &Src0 = Inst->getOperand(1); 2592 MachineOperand &Src1 = Inst->getOperand(2); 2593 DebugLoc DL = Inst->getDebugLoc(); 2594 2595 MachineBasicBlock::iterator MII = Inst; 2596 2597 const MCInstrDesc &InstDesc = get(Opcode); 2598 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2599 MRI.getRegClass(Src0.getReg()) : 2600 &AMDGPU::SGPR_32RegClass; 2601 2602 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2603 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
2604 MRI.getRegClass(Src1.getReg()) :
2605 &AMDGPU::SGPR_32RegClass;
2606
2607 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
2608
2609 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2610 AMDGPU::sub0, Src0SubRC);
2611 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2612 AMDGPU::sub0, Src1SubRC);
2613
2614 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2615 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2616 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2617
2618 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2619 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2620 .addOperand(SrcReg0Sub0)
2621 .addOperand(SrcReg1Sub0);
2622
2623 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2624 AMDGPU::sub1, Src0SubRC);
2625 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2626 AMDGPU::sub1, Src1SubRC);
2627
2628 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2629 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2630 .addOperand(SrcReg0Sub1)
2631 .addOperand(SrcReg1Sub1);
2632
2633 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2634 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2635 .addReg(DestSub0)
2636 .addImm(AMDGPU::sub0)
2637 .addReg(DestSub1)
2638 .addImm(AMDGPU::sub1);
2639
2640 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2641
2642 // Try to legalize the operands in case we need to swap the order to keep it
2643 // valid.
2644 legalizeOperands(LoHalf);
2645 legalizeOperands(HiHalf);
2646
2647 // Move all users of this moved value.
2648 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2649 }
2650
2651 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
2652 MachineInstr *Inst) const {
2653 MachineBasicBlock &MBB = *Inst->getParent();
2654 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2655
2656 MachineBasicBlock::iterator MII = Inst;
2657 DebugLoc DL = Inst->getDebugLoc();
2658
2659 MachineOperand &Dest = Inst->getOperand(0);
2660 MachineOperand &Src = Inst->getOperand(1);
2661
2662 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
2663 const TargetRegisterClass *SrcRC = Src.isReg() ?
2664 MRI.getRegClass(Src.getReg()) :
2665 &AMDGPU::SGPR_32RegClass;
2666
2667 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2668 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2669
2670 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2671
2672 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2673 AMDGPU::sub0, SrcSubRC);
2674 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2675 AMDGPU::sub1, SrcSubRC);
2676
2677 BuildMI(MBB, MII, DL, InstDesc, MidReg)
2678 .addOperand(SrcRegSub0)
2679 .addImm(0);
2680
2681 BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2682 .addOperand(SrcRegSub1)
2683 .addReg(MidReg);
2684
2685 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2686
2687 // We don't need to legalize operands here. src0 for either instruction can be
2688 // an SGPR, and the second input is unused or determined here.
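// Net effect: bcnt64(src) = V_BCNT(src.sub1, V_BCNT(src.sub0, 0)), since
// V_BCNT adds its second operand to the population count of the first.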
2689 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2690 } 2691 2692 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, 2693 MachineInstr *Inst) const { 2694 MachineBasicBlock &MBB = *Inst->getParent(); 2695 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2696 MachineBasicBlock::iterator MII = Inst; 2697 DebugLoc DL = Inst->getDebugLoc(); 2698 2699 MachineOperand &Dest = Inst->getOperand(0); 2700 uint32_t Imm = Inst->getOperand(2).getImm(); 2701 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2702 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2703 2704 (void) Offset; 2705 2706 // Only sext_inreg cases handled. 2707 assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && 2708 BitWidth <= 32 && 2709 Offset == 0 && 2710 "Not implemented"); 2711 2712 if (BitWidth < 32) { 2713 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2714 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2715 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2716 2717 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 2718 .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) 2719 .addImm(0) 2720 .addImm(BitWidth); 2721 2722 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 2723 .addImm(31) 2724 .addReg(MidRegLo); 2725 2726 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 2727 .addReg(MidRegLo) 2728 .addImm(AMDGPU::sub0) 2729 .addReg(MidRegHi) 2730 .addImm(AMDGPU::sub1); 2731 2732 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2733 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2734 return; 2735 } 2736 2737 MachineOperand &Src = Inst->getOperand(1); 2738 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2739 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2740 2741 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 2742 .addImm(31) 2743 .addReg(Src.getReg(), 0, AMDGPU::sub0); 2744 2745 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 2746 .addReg(Src.getReg(), 0, AMDGPU::sub0) 2747 .addImm(AMDGPU::sub0) 2748 .addReg(TmpReg) 2749 .addImm(AMDGPU::sub1); 2750 2751 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2752 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2753 } 2754 2755 void SIInstrInfo::addUsersToMoveToVALUWorklist( 2756 unsigned DstReg, 2757 MachineRegisterInfo &MRI, 2758 SmallVectorImpl<MachineInstr *> &Worklist) const { 2759 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 2760 E = MRI.use_end(); I != E; ++I) { 2761 MachineInstr &UseMI = *I->getParent(); 2762 if (!canReadVGPR(UseMI, I.getOperandNo())) { 2763 Worklist.push_back(&UseMI); 2764 } 2765 } 2766 } 2767 2768 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst, 2769 SmallVectorImpl<MachineInstr *> &Worklist) const { 2770 // This assumes that all the users of SCC are in the same block 2771 // as the SCC def. 2772 for (MachineBasicBlock::iterator I = SCCDefInst, 2773 E = SCCDefInst->getParent()->end(); I != E; ++I) { 2774 2775 // Exit if we find another SCC def. 
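// (Any SCC users past such a redefinition belong to the newer def, e.g. a
// later S_CMP feeding its own S_CBRANCH_SCC1, so they must not be queued.)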
2776 if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
2777 return;
2778
2779 if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
2780 Worklist.push_back(I);
2781 }
2782 }
2783
2784 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
2785 const MachineInstr &Inst) const {
2786 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
2787
2788 switch (Inst.getOpcode()) {
2789 // For target instructions, getOpRegClass just returns the virtual register
2790 // class associated with the operand, so we need to find an equivalent VGPR
2791 // register class in order to move the instruction to the VALU.
2792 case AMDGPU::COPY:
2793 case AMDGPU::PHI:
2794 case AMDGPU::REG_SEQUENCE:
2795 case AMDGPU::INSERT_SUBREG:
2796 if (RI.hasVGPRs(NewDstRC))
2797 return nullptr;
2798
2799 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
2800 if (!NewDstRC)
2801 return nullptr;
2802 return NewDstRC;
2803 default:
2804 return NewDstRC;
2805 }
2806 }
2807
2808 // Find the one SGPR operand we are allowed to use.
2809 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
2810 int OpIndices[3]) const {
2811 const MCInstrDesc &Desc = MI->getDesc();
2812
2813 // Find the one SGPR operand we are allowed to use.
2814 //
2815 // First we need to consider the instruction's operand requirements before
2816 // legalizing. Some operands are required to be SGPRs, such as implicit uses
2817 // of VCC, but we are still bound by the constant bus requirement to only use
2818 // one.
2819 //
2820 // If the operand's class is an SGPR, we can never move it.
2821
2822 unsigned SGPRReg = findImplicitSGPRRead(*MI);
2823 if (SGPRReg != AMDGPU::NoRegister)
2824 return SGPRReg;
2825
2826 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
2827 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2828
2829 for (unsigned i = 0; i < 3; ++i) {
2830 int Idx = OpIndices[i];
2831 if (Idx == -1)
2832 break;
2833
2834 const MachineOperand &MO = MI->getOperand(Idx);
2835 if (!MO.isReg())
2836 continue;
2837
2838 // Is this operand statically required to be an SGPR based on the operand
2839 // constraints?
2840 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
2841 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
2842 if (IsRequiredSGPR)
2843 return MO.getReg();
2844
2845 // If this could be a VGPR or an SGPR, check the dynamic register class.
2846 unsigned Reg = MO.getReg();
2847 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
2848 if (RI.isSGPRClass(RegRC))
2849 UsedSGPRs[i] = Reg;
2850 }
2851
2852 // We don't have a required SGPR operand, so we have a bit more freedom in
2853 // selecting operands to move.
2854
2855 // Try to select the most used SGPR. If an SGPR is equal to one of the
2856 // others, we choose that.
2857 //
2858 // e.g.
2859 // V_FMA_F32 v0, s0, s0, s0 -> No moves
2860 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
2861
2862 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
2863 // prefer those.
2864 2865 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2866 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2867 SGPRReg = UsedSGPRs[0]; 2868 } 2869 2870 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2871 if (UsedSGPRs[1] == UsedSGPRs[2]) 2872 SGPRReg = UsedSGPRs[1]; 2873 } 2874 2875 return SGPRReg; 2876 } 2877 2878 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2879 const MachineFunction &MF) const { 2880 int End = getIndirectIndexEnd(MF); 2881 int Begin = getIndirectIndexBegin(MF); 2882 2883 if (End == -1) 2884 return; 2885 2886 2887 for (int Index = Begin; Index <= End; ++Index) 2888 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2889 2890 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2891 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2892 2893 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2894 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2895 2896 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2897 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2898 2899 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2900 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2901 2902 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2903 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2904 } 2905 2906 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2907 unsigned OperandName) const { 2908 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2909 if (Idx == -1) 2910 return nullptr; 2911 2912 return &MI.getOperand(Idx); 2913 } 2914 2915 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2916 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2917 if (ST.isAmdHsaOS()) { 2918 RsrcDataFormat |= (1ULL << 56); 2919 2920 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2921 // Set MTYPE = 2 2922 RsrcDataFormat |= (2ULL << 59); 2923 } 2924 2925 return RsrcDataFormat; 2926 } 2927 2928 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 2929 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 2930 AMDGPU::RSRC_TID_ENABLE | 2931 0xffffffff; // Size; 2932 2933 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 2934 2935 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); 2936 2937 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 2938 // Clear them unless we want a huge stride. 2939 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2940 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 2941 2942 return Rsrc23; 2943 } 2944 2945 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { 2946 unsigned Opc = MI->getOpcode(); 2947 2948 return isSMRD(Opc); 2949 } 2950 2951 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { 2952 unsigned Opc = MI->getOpcode(); 2953 2954 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 2955 } 2956 2957 ArrayRef<std::pair<int, const char *>> 2958 SIInstrInfo::getSerializableTargetIndices() const { 2959 static const std::pair<int, const char *> TargetIndices[] = { 2960 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 2961 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 2962 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 2963 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 2964 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 2965 return makeArrayRef(TargetIndices); 2966 } 2967