1 //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "SIInstrInfo.h" 17 #include "AMDGPUTargetMachine.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineInstrBuilder.h" 22 #include "llvm/CodeGen/MachineRegisterInfo.h" 23 #include "llvm/IR/Function.h" 24 #include "llvm/CodeGen/RegisterScavenging.h" 25 #include "llvm/MC/MCInstrDesc.h" 26 #include "llvm/Support/Debug.h" 27 28 using namespace llvm; 29 30 SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) 31 : AMDGPUInstrInfo(st), RI() {} 32 33 //===----------------------------------------------------------------------===// 34 // TargetInstrInfo callbacks 35 //===----------------------------------------------------------------------===// 36 37 static unsigned getNumOperandsNoGlue(SDNode *Node) { 38 unsigned N = Node->getNumOperands(); 39 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 40 --N; 41 return N; 42 } 43 44 static SDValue findChainOperand(SDNode *Load) { 45 SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); 46 assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); 47 return LastOp; 48 } 49 50 /// \brief Returns true if both nodes have the same value for the given 51 /// operand \p Op, or if both nodes do not have this operand. 52 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 53 unsigned Opc0 = N0->getMachineOpcode(); 54 unsigned Opc1 = N1->getMachineOpcode(); 55 56 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 57 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 58 59 if (Op0Idx == -1 && Op1Idx == -1) 60 return true; 61 62 63 if ((Op0Idx == -1 && Op1Idx != -1) || 64 (Op1Idx == -1 && Op0Idx != -1)) 65 return false; 66 67 // getNamedOperandIdx returns the index for the MachineInstr's operands, 68 // which includes the result as the first operand. We are indexing into the 69 // MachineSDNode's operands, so we need to skip the result operand to get 70 // the real index. 71 --Op0Idx; 72 --Op1Idx; 73 74 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 75 } 76 77 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, 78 AliasAnalysis *AA) const { 79 // TODO: The generic check fails for VALU instructions that should be 80 // rematerializable due to implicit reads of exec. We really want all of the 81 // generic logic for this except for this. 82 switch (MI->getOpcode()) { 83 case AMDGPU::V_MOV_B32_e32: 84 case AMDGPU::V_MOV_B32_e64: 85 case AMDGPU::V_MOV_B64_PSEUDO: 86 return true; 87 default: 88 return false; 89 } 90 } 91 92 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 93 int64_t &Offset0, 94 int64_t &Offset1) const { 95 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 96 return false; 97 98 unsigned Opc0 = Load0->getMachineOpcode(); 99 unsigned Opc1 = Load1->getMachineOpcode(); 100 101 // Make sure both are actually loads. 
102 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
103 return false;
104
105 if (isDS(Opc0) && isDS(Opc1)) {
106
107 // FIXME: Handle this case:
108 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
109 return false;
110
111 // Check base reg.
112 if (Load0->getOperand(1) != Load1->getOperand(1))
113 return false;
114
115 // Check chain.
116 if (findChainOperand(Load0) != findChainOperand(Load1))
117 return false;
118
119 // Skip read2 / write2 variants for simplicity.
120 // TODO: We should report true if the used offsets are adjacent (excluding
121 // the st64 versions).
122 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
123 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
124 return false;
125
126 Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
127 Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
128 return true;
129 }
130
131 if (isSMRD(Opc0) && isSMRD(Opc1)) {
132 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
133
134 // Check base reg.
135 if (Load0->getOperand(0) != Load1->getOperand(0))
136 return false;
137
138 const ConstantSDNode *Load0Offset =
139 dyn_cast<ConstantSDNode>(Load0->getOperand(1));
140 const ConstantSDNode *Load1Offset =
141 dyn_cast<ConstantSDNode>(Load1->getOperand(1));
142
143 if (!Load0Offset || !Load1Offset)
144 return false;
145
146 // Check chain.
147 if (findChainOperand(Load0) != findChainOperand(Load1))
148 return false;
149
150 Offset0 = Load0Offset->getZExtValue();
151 Offset1 = Load1Offset->getZExtValue();
152 return true;
153 }
154
155 // MUBUF and MTBUF can access the same addresses.
156 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
157
158 // MUBUF and MTBUF have vaddr at different indices.
159 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
160 findChainOperand(Load0) != findChainOperand(Load1) ||
161 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
162 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
163 return false;
164
165 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
166 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
167
168 if (OffIdx0 == -1 || OffIdx1 == -1)
169 return false;
170
171 // getNamedOperandIdx returns the index for MachineInstrs. Since they
172 // include the output in the operand list, but SDNodes don't, we need to
173 // subtract one from the index.
174 --OffIdx0;
175 --OffIdx1;
176
177 SDValue Off0 = Load0->getOperand(OffIdx0);
178 SDValue Off1 = Load1->getOperand(OffIdx1);
179
180 // The offset might be a FrameIndexSDNode.
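// If either offset is still a FrameIndex rather than a constant, there is no
// value to compare yet, so the check below conservatively bails out.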
181 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 182 return false; 183 184 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 185 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 186 return true; 187 } 188 189 return false; 190 } 191 192 static bool isStride64(unsigned Opc) { 193 switch (Opc) { 194 case AMDGPU::DS_READ2ST64_B32: 195 case AMDGPU::DS_READ2ST64_B64: 196 case AMDGPU::DS_WRITE2ST64_B32: 197 case AMDGPU::DS_WRITE2ST64_B64: 198 return true; 199 default: 200 return false; 201 } 202 } 203 204 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, 205 int64_t &Offset, 206 const TargetRegisterInfo *TRI) const { 207 unsigned Opc = LdSt->getOpcode(); 208 209 if (isDS(*LdSt)) { 210 const MachineOperand *OffsetImm = getNamedOperand(*LdSt, 211 AMDGPU::OpName::offset); 212 if (OffsetImm) { 213 // Normal, single offset LDS instruction. 214 const MachineOperand *AddrReg = getNamedOperand(*LdSt, 215 AMDGPU::OpName::addr); 216 217 BaseReg = AddrReg->getReg(); 218 Offset = OffsetImm->getImm(); 219 return true; 220 } 221 222 // The 2 offset instructions use offset0 and offset1 instead. We can treat 223 // these as a load with a single offset if the 2 offsets are consecutive. We 224 // will use this for some partially aligned loads. 225 const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, 226 AMDGPU::OpName::offset0); 227 // DS_PERMUTE does not have Offset0Imm (and Offset1Imm). 228 if (!Offset0Imm) 229 return false; 230 231 const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, 232 AMDGPU::OpName::offset1); 233 234 uint8_t Offset0 = Offset0Imm->getImm(); 235 uint8_t Offset1 = Offset1Imm->getImm(); 236 237 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 238 // Each of these offsets is in element sized units, so we need to convert 239 // to bytes of the individual reads. 
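// Illustrative example (hypothetical operands): a ds_read2_b32 with
// offset0 = 2 and offset1 = 3 loads into a 64-bit destination pair, so
// EltSize = 8 / 2 = 4 and the pair is reported below as a single access at
// Offset = 4 * 2 = 8 bytes from the base address.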
240
241 unsigned EltSize;
242 if (LdSt->mayLoad())
243 EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
244 else {
245 assert(LdSt->mayStore());
246 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
247 EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
248 }
249
250 if (isStride64(Opc))
251 EltSize *= 64;
252
253 const MachineOperand *AddrReg = getNamedOperand(*LdSt,
254 AMDGPU::OpName::addr);
255 BaseReg = AddrReg->getReg();
256 Offset = EltSize * Offset0;
257 return true;
258 }
259
260 return false;
261 }
262
263 if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
264 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
265 return false;
266
267 const MachineOperand *AddrReg = getNamedOperand(*LdSt,
268 AMDGPU::OpName::vaddr);
269 if (!AddrReg)
270 return false;
271
272 const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
273 AMDGPU::OpName::offset);
274 BaseReg = AddrReg->getReg();
275 Offset = OffsetImm->getImm();
276 return true;
277 }
278
279 if (isSMRD(*LdSt)) {
280 const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
281 AMDGPU::OpName::offset);
282 if (!OffsetImm)
283 return false;
284
285 const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
286 AMDGPU::OpName::sbase);
287 BaseReg = SBaseReg->getReg();
288 Offset = OffsetImm->getImm();
289 return true;
290 }
291
292 return false;
293 }
294
295 bool SIInstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
296 MachineInstr *SecondLdSt,
297 unsigned NumLoads) const {
298 const MachineOperand *FirstDst = nullptr;
299 const MachineOperand *SecondDst = nullptr;
300
301 if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) {
302 FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
303 SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
304 }
305
306 if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) {
307 FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
308 SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
309 }
310
311 if ((isMUBUF(*FirstLdSt) && isMUBUF(*SecondLdSt)) ||
312 (isMTBUF(*FirstLdSt) && isMTBUF(*SecondLdSt))) {
313 FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
314 SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
315 }
316
317 if (!FirstDst || !SecondDst)
318 return false;
319
320 // Try to limit clustering based on the total number of bytes loaded
321 // rather than the number of instructions. This is done to help reduce
322 // register pressure. The method used is somewhat inexact, though,
323 // because it assumes that all loads in the cluster will load the
324 // same number of bytes as FirstLdSt.
325
326 // The unit of this value is bytes.
327 // FIXME: This needs finer tuning.
328 unsigned LoadClusterThreshold = 16;
329
330 const MachineRegisterInfo &MRI =
331 FirstLdSt->getParent()->getParent()->getRegInfo();
332 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
333
334 return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
335 }
336
337 void
338 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
339 MachineBasicBlock::iterator MI, DebugLoc DL,
340 unsigned DestReg, unsigned SrcReg,
341 bool KillSrc) const {
342
343 // If we are trying to copy to or from SCC, there is a bug somewhere else in
344 // the backend. While it may be theoretically possible to do this, it should
345 // never be necessary.
346 assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); 347 348 static const int16_t Sub0_15[] = { 349 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 350 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 351 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 352 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 353 }; 354 355 static const int16_t Sub0_15_64[] = { 356 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 357 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 358 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 359 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 360 }; 361 362 static const int16_t Sub0_7[] = { 363 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 364 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 365 }; 366 367 static const int16_t Sub0_7_64[] = { 368 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 369 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 370 }; 371 372 static const int16_t Sub0_3[] = { 373 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 374 }; 375 376 static const int16_t Sub0_3_64[] = { 377 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 378 }; 379 380 static const int16_t Sub0_2[] = { 381 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 382 }; 383 384 static const int16_t Sub0_1[] = { 385 AMDGPU::sub0, AMDGPU::sub1, 386 }; 387 388 unsigned Opcode; 389 ArrayRef<int16_t> SubIndices; 390 bool Forward; 391 392 if (AMDGPU::SReg_32RegClass.contains(DestReg)) { 393 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 394 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 395 .addReg(SrcReg, getKillRegState(KillSrc)); 396 return; 397 398 } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { 399 if (DestReg == AMDGPU::VCC) { 400 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 401 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 402 .addReg(SrcReg, getKillRegState(KillSrc)); 403 } else { 404 // FIXME: Hack until VReg_1 removed. 
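// The VReg_1 source holds a per-lane boolean in a VGPR, so comparing it
// against zero with the V_CMP_NE below materializes that mask directly
// into VCC.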
405 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 406 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) 407 .addImm(0) 408 .addReg(SrcReg, getKillRegState(KillSrc)); 409 } 410 411 return; 412 } 413 414 assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); 415 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 416 .addReg(SrcReg, getKillRegState(KillSrc)); 417 return; 418 419 } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { 420 assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); 421 Opcode = AMDGPU::S_MOV_B64; 422 SubIndices = Sub0_3_64; 423 424 } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { 425 assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); 426 Opcode = AMDGPU::S_MOV_B64; 427 SubIndices = Sub0_7_64; 428 429 } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { 430 assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); 431 Opcode = AMDGPU::S_MOV_B64; 432 SubIndices = Sub0_15_64; 433 434 } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { 435 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 436 AMDGPU::SReg_32RegClass.contains(SrcReg)); 437 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 438 .addReg(SrcReg, getKillRegState(KillSrc)); 439 return; 440 441 } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { 442 assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || 443 AMDGPU::SReg_64RegClass.contains(SrcReg)); 444 Opcode = AMDGPU::V_MOV_B32_e32; 445 SubIndices = Sub0_1; 446 447 } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { 448 assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); 449 Opcode = AMDGPU::V_MOV_B32_e32; 450 SubIndices = Sub0_2; 451 452 } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { 453 assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || 454 AMDGPU::SReg_128RegClass.contains(SrcReg)); 455 Opcode = AMDGPU::V_MOV_B32_e32; 456 SubIndices = Sub0_3; 457 458 } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { 459 assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || 460 AMDGPU::SReg_256RegClass.contains(SrcReg)); 461 Opcode = AMDGPU::V_MOV_B32_e32; 462 SubIndices = Sub0_7; 463 464 } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { 465 assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || 466 AMDGPU::SReg_512RegClass.contains(SrcReg)); 467 Opcode = AMDGPU::V_MOV_B32_e32; 468 SubIndices = Sub0_15; 469 470 } else { 471 llvm_unreachable("Can't copy register!"); 472 } 473 474 if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) 475 Forward = true; 476 else 477 Forward = false; 478 479 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 480 unsigned SubIdx; 481 if (Forward) 482 SubIdx = SubIndices[Idx]; 483 else 484 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 485 486 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 487 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 488 489 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 490 491 if (Idx == SubIndices.size() - 1) 492 Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); 493 494 if (Idx == 0) 495 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 496 } 497 } 498 499 int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { 500 const unsigned Opcode = MI.getOpcode(); 501 502 int NewOpc; 503 504 // Try to map original to commuted opcode 505 NewOpc = AMDGPU::getCommuteRev(Opcode); 506 if (NewOpc != -1) 507 // Check if the commuted (REV) opcode exists on the target. 508 return pseudoToMCOpcode(NewOpc) != -1 ? 
NewOpc : -1; 509 510 // Try to map commuted to original opcode 511 NewOpc = AMDGPU::getCommuteOrig(Opcode); 512 if (NewOpc != -1) 513 // Check if the original (non-REV) opcode exists on the target. 514 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 515 516 return Opcode; 517 } 518 519 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 520 521 if (DstRC->getSize() == 4) { 522 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 523 } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { 524 return AMDGPU::S_MOV_B64; 525 } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { 526 return AMDGPU::V_MOV_B64_PSEUDO; 527 } 528 return AMDGPU::COPY; 529 } 530 531 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 532 switch (Size) { 533 case 4: 534 return AMDGPU::SI_SPILL_S32_SAVE; 535 case 8: 536 return AMDGPU::SI_SPILL_S64_SAVE; 537 case 16: 538 return AMDGPU::SI_SPILL_S128_SAVE; 539 case 32: 540 return AMDGPU::SI_SPILL_S256_SAVE; 541 case 64: 542 return AMDGPU::SI_SPILL_S512_SAVE; 543 default: 544 llvm_unreachable("unknown register size"); 545 } 546 } 547 548 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 549 switch (Size) { 550 case 4: 551 return AMDGPU::SI_SPILL_V32_SAVE; 552 case 8: 553 return AMDGPU::SI_SPILL_V64_SAVE; 554 case 12: 555 return AMDGPU::SI_SPILL_V96_SAVE; 556 case 16: 557 return AMDGPU::SI_SPILL_V128_SAVE; 558 case 32: 559 return AMDGPU::SI_SPILL_V256_SAVE; 560 case 64: 561 return AMDGPU::SI_SPILL_V512_SAVE; 562 default: 563 llvm_unreachable("unknown register size"); 564 } 565 } 566 567 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 568 MachineBasicBlock::iterator MI, 569 unsigned SrcReg, bool isKill, 570 int FrameIndex, 571 const TargetRegisterClass *RC, 572 const TargetRegisterInfo *TRI) const { 573 MachineFunction *MF = MBB.getParent(); 574 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 575 MachineFrameInfo *FrameInfo = MF->getFrameInfo(); 576 DebugLoc DL = MBB.findDebugLoc(MI); 577 578 unsigned Size = FrameInfo->getObjectSize(FrameIndex); 579 unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); 580 MachinePointerInfo PtrInfo 581 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 582 MachineMemOperand *MMO 583 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 584 Size, Align); 585 586 if (RI.isSGPRClass(RC)) { 587 MFI->setHasSpilledSGPRs(); 588 589 // We are only allowed to create one new instruction when spilling 590 // registers, so we need to use pseudo instruction for spilling 591 // SGPRs. 
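// For example, spilling an SReg_64 (RC->getSize() == 8) selects the
// SI_SPILL_S64_SAVE pseudo via getSGPRSpillSaveOpcode below.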
592 unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); 593 BuildMI(MBB, MI, DL, get(Opcode)) 594 .addReg(SrcReg) // src 595 .addFrameIndex(FrameIndex) // frame_idx 596 .addMemOperand(MMO); 597 598 return; 599 } 600 601 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 602 LLVMContext &Ctx = MF->getFunction()->getContext(); 603 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 604 " spill register"); 605 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 606 .addReg(SrcReg); 607 608 return; 609 } 610 611 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 612 613 unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); 614 MFI->setHasSpilledVGPRs(); 615 BuildMI(MBB, MI, DL, get(Opcode)) 616 .addReg(SrcReg) // src 617 .addFrameIndex(FrameIndex) // frame_idx 618 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 619 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 620 .addImm(0) // offset 621 .addMemOperand(MMO); 622 } 623 624 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 625 switch (Size) { 626 case 4: 627 return AMDGPU::SI_SPILL_S32_RESTORE; 628 case 8: 629 return AMDGPU::SI_SPILL_S64_RESTORE; 630 case 16: 631 return AMDGPU::SI_SPILL_S128_RESTORE; 632 case 32: 633 return AMDGPU::SI_SPILL_S256_RESTORE; 634 case 64: 635 return AMDGPU::SI_SPILL_S512_RESTORE; 636 default: 637 llvm_unreachable("unknown register size"); 638 } 639 } 640 641 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 642 switch (Size) { 643 case 4: 644 return AMDGPU::SI_SPILL_V32_RESTORE; 645 case 8: 646 return AMDGPU::SI_SPILL_V64_RESTORE; 647 case 12: 648 return AMDGPU::SI_SPILL_V96_RESTORE; 649 case 16: 650 return AMDGPU::SI_SPILL_V128_RESTORE; 651 case 32: 652 return AMDGPU::SI_SPILL_V256_RESTORE; 653 case 64: 654 return AMDGPU::SI_SPILL_V512_RESTORE; 655 default: 656 llvm_unreachable("unknown register size"); 657 } 658 } 659 660 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 661 MachineBasicBlock::iterator MI, 662 unsigned DestReg, int FrameIndex, 663 const TargetRegisterClass *RC, 664 const TargetRegisterInfo *TRI) const { 665 MachineFunction *MF = MBB.getParent(); 666 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 667 MachineFrameInfo *FrameInfo = MF->getFrameInfo(); 668 DebugLoc DL = MBB.findDebugLoc(MI); 669 unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); 670 unsigned Size = FrameInfo->getObjectSize(FrameIndex); 671 672 MachinePointerInfo PtrInfo 673 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 674 675 MachineMemOperand *MMO = MF->getMachineMemOperand( 676 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 677 678 if (RI.isSGPRClass(RC)) { 679 // FIXME: Maybe this should not include a memoperand because it will be 680 // lowered to non-memory instructions. 
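// For example, restoring an SReg_128 (RC->getSize() == 16) selects the
// SI_SPILL_S128_RESTORE pseudo via getSGPRSpillRestoreOpcode below.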
681 unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); 682 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 683 .addFrameIndex(FrameIndex) // frame_idx 684 .addMemOperand(MMO); 685 686 return; 687 } 688 689 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 690 LLVMContext &Ctx = MF->getFunction()->getContext(); 691 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 692 " restore register"); 693 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 694 695 return; 696 } 697 698 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 699 700 unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); 701 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 702 .addFrameIndex(FrameIndex) // frame_idx 703 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 704 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 705 .addImm(0) // offset 706 .addMemOperand(MMO); 707 } 708 709 /// \param @Offset Offset in bytes of the FrameIndex being spilled 710 unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, 711 MachineBasicBlock::iterator MI, 712 RegScavenger *RS, unsigned TmpReg, 713 unsigned FrameOffset, 714 unsigned Size) const { 715 MachineFunction *MF = MBB.getParent(); 716 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 717 const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); 718 const SIRegisterInfo *TRI = 719 static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); 720 DebugLoc DL = MBB.findDebugLoc(MI); 721 unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); 722 unsigned WavefrontSize = ST.getWavefrontSize(); 723 724 unsigned TIDReg = MFI->getTIDReg(); 725 if (!MFI->hasCalculatedTID()) { 726 MachineBasicBlock &Entry = MBB.getParent()->front(); 727 MachineBasicBlock::iterator Insert = Entry.front(); 728 DebugLoc DL = Insert->getDebugLoc(); 729 730 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); 731 if (TIDReg == AMDGPU::NoRegister) 732 return TIDReg; 733 734 735 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 736 WorkGroupSize > WavefrontSize) { 737 738 unsigned TIDIGXReg 739 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); 740 unsigned TIDIGYReg 741 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 742 unsigned TIDIGZReg 743 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 744 unsigned InputPtrReg = 745 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 746 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 747 if (!Entry.isLiveIn(Reg)) 748 Entry.addLiveIn(Reg); 749 } 750 751 RS->enterBasicBlock(Entry); 752 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
753 unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
754 unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
755 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
756 .addReg(InputPtrReg)
757 .addImm(SI::KernelInputOffsets::NGROUPS_Z);
758 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
759 .addReg(InputPtrReg)
760 .addImm(SI::KernelInputOffsets::NGROUPS_Y);
761
762 // NGROUPS.X * NGROUPS.Y
763 BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
764 .addReg(STmp1)
765 .addReg(STmp0);
766 // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
767 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
768 .addReg(STmp1)
769 .addReg(TIDIGXReg);
770 // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
771 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
772 .addReg(STmp0)
773 .addReg(TIDIGYReg)
774 .addReg(TIDReg);
775 // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
776 BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
777 .addReg(TIDReg)
778 .addReg(TIDIGZReg);
779 } else {
780 // Get the wave id
781 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
782 TIDReg)
783 .addImm(-1)
784 .addImm(0);
785
786 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
787 TIDReg)
788 .addImm(-1)
789 .addReg(TIDReg);
790 }
791
792 BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
793 TIDReg)
794 .addImm(2)
795 .addReg(TIDReg);
796 MFI->setTIDReg(TIDReg);
797 }
798
799 // Add FrameIndex to LDS offset
800 unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
801 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
802 .addImm(LDSOffset)
803 .addReg(TIDReg);
804
805 return TmpReg;
806 }
807
808 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
809 MachineBasicBlock::iterator MI,
810 int Count) const {
811 while (Count > 0) {
812 int Arg;
813 if (Count >= 8)
814 Arg = 7;
815 else
816 Arg = Count - 1;
817 Count -= 8;
818 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
819 .addImm(Arg);
820 }
821 }
822
823 bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
824 MachineBasicBlock &MBB = *MI->getParent();
825 DebugLoc DL = MBB.findDebugLoc(MI);
826 switch (MI->getOpcode()) {
827 default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
828
829 case AMDGPU::SGPR_USE:
830 // This is just a placeholder for register allocation.
831 MI->eraseFromParent();
832 break;
833
834 case AMDGPU::V_MOV_B64_PSEUDO: {
835 unsigned Dst = MI->getOperand(0).getReg();
836 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
837 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
838
839 const MachineOperand &SrcOp = MI->getOperand(1);
840 // FIXME: Will this work for 64-bit floating point immediates?
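// Illustrative expansion (hypothetical value): a 64-bit immediate of
// 0x0000000100000002 becomes v_mov_b32 dst.sub0, 0x2 followed by
// v_mov_b32 dst.sub1, 0x1.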
841 assert(!SrcOp.isFPImm()); 842 if (SrcOp.isImm()) { 843 APInt Imm(64, SrcOp.getImm()); 844 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 845 .addImm(Imm.getLoBits(32).getZExtValue()) 846 .addReg(Dst, RegState::Implicit); 847 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 848 .addImm(Imm.getHiBits(32).getZExtValue()) 849 .addReg(Dst, RegState::Implicit); 850 } else { 851 assert(SrcOp.isReg()); 852 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 853 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 854 .addReg(Dst, RegState::Implicit); 855 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 856 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 857 .addReg(Dst, RegState::Implicit); 858 } 859 MI->eraseFromParent(); 860 break; 861 } 862 863 case AMDGPU::V_CNDMASK_B64_PSEUDO: { 864 unsigned Dst = MI->getOperand(0).getReg(); 865 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 866 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 867 unsigned Src0 = MI->getOperand(1).getReg(); 868 unsigned Src1 = MI->getOperand(2).getReg(); 869 const MachineOperand &SrcCond = MI->getOperand(3); 870 871 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) 872 .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) 873 .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) 874 .addOperand(SrcCond); 875 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) 876 .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) 877 .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) 878 .addOperand(SrcCond); 879 MI->eraseFromParent(); 880 break; 881 } 882 883 case AMDGPU::SI_CONSTDATA_PTR: { 884 const SIRegisterInfo *TRI = 885 static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); 886 MachineFunction &MF = *MBB.getParent(); 887 unsigned Reg = MI->getOperand(0).getReg(); 888 unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); 889 unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); 890 891 // Create a bundle so these instructions won't be re-ordered by the 892 // post-RA scheduler. 893 MIBundleBuilder Bundler(MBB, MI); 894 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 895 896 // Add 32-bit offset from this instruction to the start of the 897 // constant data. 898 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 899 .addReg(RegLo) 900 .addOperand(MI->getOperand(1))); 901 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 902 .addReg(RegHi) 903 .addImm(0)); 904 905 llvm::finalizeBundle(MBB, Bundler.begin()); 906 907 MI->eraseFromParent(); 908 break; 909 } 910 } 911 return true; 912 } 913 914 /// Commutes the operands in the given instruction. 915 /// The commutable operands are specified by their indices OpIdx0 and OpIdx1. 916 /// 917 /// Do not call this method for a non-commutable instruction or for 918 /// non-commutable pair of operand indices OpIdx0 and OpIdx1. 919 /// Even though the instruction is commutable, the method may still 920 /// fail to commute the operands, null pointer is returned in such cases. 
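/// For example, commuteOpcode maps V_SUB_F32 to V_SUBREV_F32 (and back), so
/// commuting v_sub_f32 dst, a, b produces v_subrev_f32 dst, b, a.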
921 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, 922 bool NewMI, 923 unsigned OpIdx0, 924 unsigned OpIdx1) const { 925 int CommutedOpcode = commuteOpcode(*MI); 926 if (CommutedOpcode == -1) 927 return nullptr; 928 929 int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 930 AMDGPU::OpName::src0); 931 MachineOperand &Src0 = MI->getOperand(Src0Idx); 932 if (!Src0.isReg()) 933 return nullptr; 934 935 int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 936 AMDGPU::OpName::src1); 937 938 if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || 939 OpIdx1 != static_cast<unsigned>(Src1Idx)) && 940 (OpIdx0 != static_cast<unsigned>(Src1Idx) || 941 OpIdx1 != static_cast<unsigned>(Src0Idx))) 942 return nullptr; 943 944 MachineOperand &Src1 = MI->getOperand(Src1Idx); 945 946 947 if (isVOP2(*MI)) { 948 const MCInstrDesc &InstrDesc = MI->getDesc(); 949 // For VOP2 instructions, any operand type is valid to use for src0. Make 950 // sure we can use the src1 as src0. 951 // 952 // We could be stricter here and only allow commuting if there is a reason 953 // to do so. i.e. if both operands are VGPRs there is no real benefit, 954 // although MachineCSE attempts to find matches by commuting. 955 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 956 if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) 957 return nullptr; 958 } 959 960 if (!Src1.isReg()) { 961 // Allow commuting instructions with Imm operands. 962 if (NewMI || !Src1.isImm() || 963 (!isVOP2(*MI) && !isVOP3(*MI))) { 964 return nullptr; 965 } 966 // Be sure to copy the source modifiers to the right place. 967 if (MachineOperand *Src0Mods 968 = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 969 MachineOperand *Src1Mods 970 = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); 971 972 int Src0ModsVal = Src0Mods->getImm(); 973 if (!Src1Mods && Src0ModsVal != 0) 974 return nullptr; 975 976 // XXX - This assert might be a lie. It might be useful to have a neg 977 // modifier with 0.0. 978 int Src1ModsVal = Src1Mods->getImm(); 979 assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); 980 981 Src1Mods->setImm(Src0ModsVal); 982 Src0Mods->setImm(Src1ModsVal); 983 } 984 985 unsigned Reg = Src0.getReg(); 986 unsigned SubReg = Src0.getSubReg(); 987 if (Src1.isImm()) 988 Src0.ChangeToImmediate(Src1.getImm()); 989 else 990 llvm_unreachable("Should only have immediates"); 991 992 Src1.ChangeToRegister(Reg, false); 993 Src1.setSubReg(SubReg); 994 } else { 995 MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); 996 } 997 998 if (MI) 999 MI->setDesc(get(CommutedOpcode)); 1000 1001 return MI; 1002 } 1003 1004 // This needs to be implemented because the source modifiers may be inserted 1005 // between the true commutable operands, and the base 1006 // TargetInstrInfo::commuteInstruction uses it. 1007 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, 1008 unsigned &SrcOpIdx0, 1009 unsigned &SrcOpIdx1) const { 1010 const MCInstrDesc &MCID = MI->getDesc(); 1011 if (!MCID.isCommutable()) 1012 return false; 1013 1014 unsigned Opc = MI->getOpcode(); 1015 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1016 if (Src0Idx == -1) 1017 return false; 1018 1019 // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on 1020 // immediate. 
Also, immediate src0 operand is not handled in
1021 // SIInstrInfo::commuteInstruction().
1022 if (!MI->getOperand(Src0Idx).isReg())
1023 return false;
1024
1025 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1026 if (Src1Idx == -1)
1027 return false;
1028
1029 MachineOperand &Src1 = MI->getOperand(Src1Idx);
1030 if (Src1.isImm()) {
1031 // SIInstrInfo::commuteInstruction() does support commuting the immediate
1032 // operand src1 in 2 and 3 operand instructions.
1033 if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
1034 return false;
1035 } else if (Src1.isReg()) {
1036 // If any source modifiers are set, the generic instruction commuting won't
1037 // understand how to copy the source modifiers.
1038 if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
1039 hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
1040 return false;
1041 } else
1042 return false;
1043
1044 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1045 }
1046
1047 static void removeModOperands(MachineInstr &MI) {
1048 unsigned Opc = MI.getOpcode();
1049 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1050 AMDGPU::OpName::src0_modifiers);
1051 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1052 AMDGPU::OpName::src1_modifiers);
1053 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1054 AMDGPU::OpName::src2_modifiers);
1055
1056 MI.RemoveOperand(Src2ModIdx);
1057 MI.RemoveOperand(Src1ModIdx);
1058 MI.RemoveOperand(Src0ModIdx);
1059 }
1060
1061 // TODO: Maybe this should be removed and everything custom folded in
1062 // SIFoldOperands?
1063 bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
1064 unsigned Reg, MachineRegisterInfo *MRI) const {
1065 if (!MRI->hasOneNonDBGUse(Reg))
1066 return false;
1067
1068 unsigned Opc = UseMI->getOpcode();
1069 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
1070 // Don't fold if we are using source modifiers. The new VOP2 instructions
1071 // don't have them.
1072 if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
1073 hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
1074 hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
1075 return false;
1076 }
1077
1078 const MachineOperand &ImmOp = DefMI->getOperand(1);
1079
1080 // If this is a free constant, there's no reason to do this.
1081 // TODO: We could fold this here instead of letting SIFoldOperands do it
1082 // later.
1083 if (isInlineConstant(ImmOp, 4))
1084 return false;
1085
1086 MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
1087 MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
1088 MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
1089
1090 // Multiplied part is the constant: Use v_madmk_f32
1091 // We should only expect these to be on src0 due to canonicalizations.
1092 if (Src0->isReg() && Src0->getReg() == Reg) {
1093 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1094 return false;
1095
1096 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1097 return false;
1098
1099 // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1100
1101 const int64_t Imm = DefMI->getOperand(1).getImm();
1102
1103 // FIXME: This would be a lot easier if we could return a new instruction
1104 // instead of having to modify in place.
1105
1106 // Remove these first since they are at the end.
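// Removing from the tail keeps the Src0/Src1/Src2 operand pointers obtained
// above valid: operands in front of the removed ones do not shift.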
1107 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1108 AMDGPU::OpName::omod)); 1109 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1110 AMDGPU::OpName::clamp)); 1111 1112 unsigned Src1Reg = Src1->getReg(); 1113 unsigned Src1SubReg = Src1->getSubReg(); 1114 Src0->setReg(Src1Reg); 1115 Src0->setSubReg(Src1SubReg); 1116 Src0->setIsKill(Src1->isKill()); 1117 1118 if (Opc == AMDGPU::V_MAC_F32_e64) { 1119 UseMI->untieRegOperand( 1120 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1121 } 1122 1123 Src1->ChangeToImmediate(Imm); 1124 1125 removeModOperands(*UseMI); 1126 UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); 1127 1128 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1129 if (DeleteDef) 1130 DefMI->eraseFromParent(); 1131 1132 return true; 1133 } 1134 1135 // Added part is the constant: Use v_madak_f32 1136 if (Src2->isReg() && Src2->getReg() == Reg) { 1137 // Not allowed to use constant bus for another operand. 1138 // We can however allow an inline immediate as src0. 1139 if (!Src0->isImm() && 1140 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1141 return false; 1142 1143 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1144 return false; 1145 1146 const int64_t Imm = DefMI->getOperand(1).getImm(); 1147 1148 // FIXME: This would be a lot easier if we could return a new instruction 1149 // instead of having to modify in place. 1150 1151 // Remove these first since they are at the end. 1152 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1153 AMDGPU::OpName::omod)); 1154 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1155 AMDGPU::OpName::clamp)); 1156 1157 if (Opc == AMDGPU::V_MAC_F32_e64) { 1158 UseMI->untieRegOperand( 1159 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1160 } 1161 1162 // ChangingToImmediate adds Src2 back to the instruction. 1163 Src2->ChangeToImmediate(Imm); 1164 1165 // These come before src2. 1166 removeModOperands(*UseMI); 1167 UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); 1168 1169 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1170 if (DeleteDef) 1171 DefMI->eraseFromParent(); 1172 1173 return true; 1174 } 1175 } 1176 1177 return false; 1178 } 1179 1180 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1181 int WidthB, int OffsetB) { 1182 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1183 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1184 int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB;
1185 return LowOffset + LowWidth <= HighOffset;
1186 }
1187
1188 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
1189 MachineInstr *MIb) const {
1190 unsigned BaseReg0, BaseReg1;
1191 int64_t Offset0, Offset1;
1192
1193 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
1194 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
1195 assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
1196 "read2 / write2 not expected here yet");
1197 unsigned Width0 = (*MIa->memoperands_begin())->getSize();
1198 unsigned Width1 = (*MIb->memoperands_begin())->getSize();
1199 if (BaseReg0 == BaseReg1 &&
1200 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
1201 return true;
1202 }
1203 }
1204
1205 return false;
1206 }
1207
1208 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
1209 MachineInstr *MIb,
1210 AliasAnalysis *AA) const {
1211 assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
1212 "MIa must load from or modify a memory location");
1213 assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
1214 "MIb must load from or modify a memory location");
1215
1216 if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
1217 return false;
1218
1219 // XXX - Can we relax this between address spaces?
1220 if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
1221 return false;
1222
1223 // TODO: Should we check the address space from the MachineMemOperand? That
1224 // would allow us to distinguish objects we know don't alias based on the
1225 // underlying address space, even if it was lowered to a different one,
1226 // e.g. private accesses lowered to use MUBUF instructions on a scratch
1227 // buffer.
1228 if (isDS(*MIa)) {
1229 if (isDS(*MIb))
1230 return checkInstOffsetsDoNotOverlap(MIa, MIb);
1231
1232 return !isFLAT(*MIb);
1233 }
1234
1235 if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
1236 if (isMUBUF(*MIb) || isMTBUF(*MIb))
1237 return checkInstOffsetsDoNotOverlap(MIa, MIb);
1238
1239 return !isFLAT(*MIb) && !isSMRD(*MIb);
1240 }
1241
1242 if (isSMRD(*MIa)) {
1243 if (isSMRD(*MIb))
1244 return checkInstOffsetsDoNotOverlap(MIa, MIb);
1245
1246 return !isFLAT(*MIb) && !isMUBUF(*MIb) && !isMTBUF(*MIb);
1247 }
1248
1249 if (isFLAT(*MIa)) {
1250 if (isFLAT(*MIb))
1251 return checkInstOffsetsDoNotOverlap(MIa, MIb);
1252
1253 return false;
1254 }
1255
1256 return false;
1257 }
1258
1259 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
1260 MachineBasicBlock::iterator &MI,
1261 LiveVariables *LV) const {
1262
1263 switch (MI->getOpcode()) {
1264 default: return nullptr;
1265 case AMDGPU::V_MAC_F32_e64: break;
1266 case AMDGPU::V_MAC_F32_e32: {
1267 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
1268 if (Src0->isImm() && !isInlineConstant(*Src0, 4))
1269 return nullptr;
1270 break;
1271 }
1272 }
1273
1274 const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst);
1275 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
1276 const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
1277 const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
1278
1279 return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
1280 .addOperand(*Dst)
1281 .addImm(0) // Src0 mods
1282 .addOperand(*Src0)
1283 .addImm(0) // Src1 mods
1284 .addOperand(*Src1)
1285 .addImm(0) // Src2 mods
1286 .addOperand(*Src2)
1287 .addImm(0) // clamp
1288 .addImm(0); // omod
1289 }
1290
1291 bool
SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI, 1292 const MachineBasicBlock *MBB, 1293 const MachineFunction &MF) const { 1294 // Target-independent instructions do not have an implicit-use of EXEC, even 1295 // when they operate on VGPRs. Treating EXEC modifications as scheduling 1296 // boundaries prevents incorrect movements of such instructions. 1297 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 1298 if (MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1299 return true; 1300 1301 return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF); 1302 } 1303 1304 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1305 int64_t SVal = Imm.getSExtValue(); 1306 if (SVal >= -16 && SVal <= 64) 1307 return true; 1308 1309 if (Imm.getBitWidth() == 64) { 1310 uint64_t Val = Imm.getZExtValue(); 1311 return (DoubleToBits(0.0) == Val) || 1312 (DoubleToBits(1.0) == Val) || 1313 (DoubleToBits(-1.0) == Val) || 1314 (DoubleToBits(0.5) == Val) || 1315 (DoubleToBits(-0.5) == Val) || 1316 (DoubleToBits(2.0) == Val) || 1317 (DoubleToBits(-2.0) == Val) || 1318 (DoubleToBits(4.0) == Val) || 1319 (DoubleToBits(-4.0) == Val); 1320 } 1321 1322 // The actual type of the operand does not seem to matter as long 1323 // as the bits match one of the inline immediate values. For example: 1324 // 1325 // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, 1326 // so it is a legal inline immediate. 1327 // 1328 // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in 1329 // floating-point, so it is a legal inline immediate. 1330 uint32_t Val = Imm.getZExtValue(); 1331 1332 return (FloatToBits(0.0f) == Val) || 1333 (FloatToBits(1.0f) == Val) || 1334 (FloatToBits(-1.0f) == Val) || 1335 (FloatToBits(0.5f) == Val) || 1336 (FloatToBits(-0.5f) == Val) || 1337 (FloatToBits(2.0f) == Val) || 1338 (FloatToBits(-2.0f) == Val) || 1339 (FloatToBits(4.0f) == Val) || 1340 (FloatToBits(-4.0f) == Val); 1341 } 1342 1343 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 1344 unsigned OpSize) const { 1345 if (MO.isImm()) { 1346 // MachineOperand provides no way to tell the true operand size, since it 1347 // only records a 64-bit value. We need to know the size to determine if a 1348 // 32-bit floating point immediate bit pattern is legal for an integer 1349 // immediate. It would be for any 32-bit integer operand, but would not be 1350 // for a 64-bit one. 
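// For example, the bit pattern of 1.0f (0x3f800000) is a legal inline
// constant for a 32-bit operand, but as a 64-bit operand the same value is
// neither in the -16..64 range nor a recognized double constant, so it must
// be kept as a literal.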
1351 1352 unsigned BitSize = 8 * OpSize; 1353 return isInlineConstant(APInt(BitSize, MO.getImm(), true)); 1354 } 1355 1356 return false; 1357 } 1358 1359 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, 1360 unsigned OpSize) const { 1361 return MO.isImm() && !isInlineConstant(MO, OpSize); 1362 } 1363 1364 static bool compareMachineOp(const MachineOperand &Op0, 1365 const MachineOperand &Op1) { 1366 if (Op0.getType() != Op1.getType()) 1367 return false; 1368 1369 switch (Op0.getType()) { 1370 case MachineOperand::MO_Register: 1371 return Op0.getReg() == Op1.getReg(); 1372 case MachineOperand::MO_Immediate: 1373 return Op0.getImm() == Op1.getImm(); 1374 default: 1375 llvm_unreachable("Didn't expect to be comparing these operand types"); 1376 } 1377 } 1378 1379 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, 1380 const MachineOperand &MO) const { 1381 const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; 1382 1383 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1384 1385 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 1386 return true; 1387 1388 if (OpInfo.RegClass < 0) 1389 return false; 1390 1391 unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); 1392 if (isLiteralConstant(MO, OpSize)) 1393 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 1394 1395 return RI.opCanUseInlineConstant(OpInfo.OperandType); 1396 } 1397 1398 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 1399 int Op32 = AMDGPU::getVOPe32(Opcode); 1400 if (Op32 == -1) 1401 return false; 1402 1403 return pseudoToMCOpcode(Op32) != -1; 1404 } 1405 1406 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 1407 // The src0_modifier operand is present on all instructions 1408 // that have modifiers. 1409 1410 return AMDGPU::getNamedOperandIdx(Opcode, 1411 AMDGPU::OpName::src0_modifiers) != -1; 1412 } 1413 1414 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 1415 unsigned OpName) const { 1416 const MachineOperand *Mods = getNamedOperand(MI, OpName); 1417 return Mods && Mods->getImm(); 1418 } 1419 1420 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 1421 const MachineOperand &MO, 1422 unsigned OpSize) const { 1423 // Literal constants use the constant bus. 1424 if (isLiteralConstant(MO, OpSize)) 1425 return true; 1426 1427 if (!MO.isReg() || !MO.isUse()) 1428 return false; 1429 1430 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 1431 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 1432 1433 // FLAT_SCR is just an SGPR pair. 1434 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 1435 return true; 1436 1437 // EXEC register uses the constant bus. 1438 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 1439 return true; 1440 1441 // SGPRs use the constant bus 1442 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 1443 (!MO.isImplicit() && 1444 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 1445 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 1446 } 1447 1448 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 1449 for (const MachineOperand &MO : MI.implicit_operands()) { 1450 // We only care about reads. 
1451 if (MO.isDef()) 1452 continue; 1453 1454 switch (MO.getReg()) { 1455 case AMDGPU::VCC: 1456 case AMDGPU::M0: 1457 case AMDGPU::FLAT_SCR: 1458 return MO.getReg(); 1459 1460 default: 1461 break; 1462 } 1463 } 1464 1465 return AMDGPU::NoRegister; 1466 } 1467 1468 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, 1469 StringRef &ErrInfo) const { 1470 uint16_t Opcode = MI->getOpcode(); 1471 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1472 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 1473 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 1474 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 1475 1476 // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes 1477 // all SCC users are in the same blocks as their defs. 1478 const MachineBasicBlock *MBB = MI->getParent(); 1479 if (MI == &MBB->front()) { 1480 if (MBB->isLiveIn(AMDGPU::SCC)) { 1481 ErrInfo = "scc register cannot be live across blocks."; 1482 return false; 1483 } 1484 } 1485 1486 // Make sure the number of operands is correct. 1487 const MCInstrDesc &Desc = get(Opcode); 1488 if (!Desc.isVariadic() && 1489 Desc.getNumOperands() != MI->getNumExplicitOperands()) { 1490 ErrInfo = "Instruction has wrong number of operands."; 1491 return false; 1492 } 1493 1494 // Make sure the register classes are correct. 1495 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 1496 if (MI->getOperand(i).isFPImm()) { 1497 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 1498 "all fp values to integers."; 1499 return false; 1500 } 1501 1502 int RegClass = Desc.OpInfo[i].RegClass; 1503 1504 switch (Desc.OpInfo[i].OperandType) { 1505 case MCOI::OPERAND_REGISTER: 1506 if (MI->getOperand(i).isImm()) { 1507 ErrInfo = "Illegal immediate value for operand."; 1508 return false; 1509 } 1510 break; 1511 case AMDGPU::OPERAND_REG_IMM32: 1512 break; 1513 case AMDGPU::OPERAND_REG_INLINE_C: 1514 if (isLiteralConstant(MI->getOperand(i), 1515 RI.getRegClass(RegClass)->getSize())) { 1516 ErrInfo = "Illegal immediate value for operand."; 1517 return false; 1518 } 1519 break; 1520 case MCOI::OPERAND_IMMEDIATE: 1521 // Check if this operand is an immediate. 1522 // FrameIndex operands will be replaced by immediates, so they are 1523 // allowed. 1524 if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { 1525 ErrInfo = "Expected immediate, but got non-immediate"; 1526 return false; 1527 } 1528 // Fall-through 1529 default: 1530 continue; 1531 } 1532 1533 if (!MI->getOperand(i).isReg()) 1534 continue; 1535 1536 if (RegClass != -1) { 1537 unsigned Reg = MI->getOperand(i).getReg(); 1538 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1539 continue; 1540 1541 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 1542 if (!RC->contains(Reg)) { 1543 ErrInfo = "Operand has incorrect register class."; 1544 return false; 1545 } 1546 } 1547 } 1548 1549 1550 // Verify VOP* 1551 if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { 1552 // Only look at the true operands. Only a real operand can use the constant 1553 // bus, and we don't want to check pseudo-operands like the source modifier 1554 // flags. 
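// For example, v_add_f32 v0, s0, s1 (two different SGPR sources) would count
// two constant bus reads and be rejected below, while v_add_f32 v0, s0, v1
// uses the bus only once and is accepted.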
1555 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 1556 1557 unsigned ConstantBusCount = 0; 1558 unsigned SGPRUsed = findImplicitSGPRRead(*MI); 1559 if (SGPRUsed != AMDGPU::NoRegister) 1560 ++ConstantBusCount; 1561 1562 for (int OpIdx : OpIndices) { 1563 if (OpIdx == -1) 1564 break; 1565 const MachineOperand &MO = MI->getOperand(OpIdx); 1566 if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { 1567 if (MO.isReg()) { 1568 if (MO.getReg() != SGPRUsed) 1569 ++ConstantBusCount; 1570 SGPRUsed = MO.getReg(); 1571 } else { 1572 ++ConstantBusCount; 1573 } 1574 } 1575 } 1576 if (ConstantBusCount > 1) { 1577 ErrInfo = "VOP* instruction uses the constant bus more than once"; 1578 return false; 1579 } 1580 } 1581 1582 // Verify misc. restrictions on specific instructions. 1583 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 1584 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 1585 const MachineOperand &Src0 = MI->getOperand(Src0Idx); 1586 const MachineOperand &Src1 = MI->getOperand(Src1Idx); 1587 const MachineOperand &Src2 = MI->getOperand(Src2Idx); 1588 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 1589 if (!compareMachineOp(Src0, Src1) && 1590 !compareMachineOp(Src0, Src2)) { 1591 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 1592 return false; 1593 } 1594 } 1595 } 1596 1597 // Make sure we aren't losing exec uses in the td files. This mostly requires 1598 // being careful when using let Uses to try to add other use registers. 1599 if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { 1600 const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); 1601 if (!Exec || !Exec->isImplicit()) { 1602 ErrInfo = "VALU instruction does not implicitly read exec mask"; 1603 return false; 1604 } 1605 } 1606 1607 return true; 1608 } 1609 1610 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 1611 switch (MI.getOpcode()) { 1612 default: return AMDGPU::INSTRUCTION_LIST_END; 1613 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 1614 case AMDGPU::COPY: return AMDGPU::COPY; 1615 case AMDGPU::PHI: return AMDGPU::PHI; 1616 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 1617 case AMDGPU::S_MOV_B32: 1618 return MI.getOperand(1).isReg() ? 
1619 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 1620 case AMDGPU::S_ADD_I32: 1621 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 1622 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 1623 case AMDGPU::S_SUB_I32: 1624 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 1625 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 1626 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 1627 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; 1628 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; 1629 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; 1630 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; 1631 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; 1632 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; 1633 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; 1634 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 1635 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 1636 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 1637 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 1638 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 1639 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 1640 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 1641 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 1642 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 1643 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 1644 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 1645 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 1646 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 1647 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 1648 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 1649 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 1650 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 1651 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 1652 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 1653 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 1654 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 1655 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 1656 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 1657 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 1658 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 1659 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 1660 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 1661 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 1662 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 1663 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 1664 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 1665 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 1666 } 1667 } 1668 1669 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 1670 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 1671 } 1672 1673 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 1674 unsigned OpNo) const { 1675 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 1676 const MCInstrDesc &Desc = get(MI.getOpcode()); 1677 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 1678 Desc.OpInfo[OpNo].RegClass == -1) { 1679 unsigned Reg = MI.getOperand(OpNo).getReg(); 1680 1681 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1682 return MRI.getRegClass(Reg); 1683 return RI.getPhysRegClass(Reg); 1684 } 1685 1686 unsigned RCID = 
Desc.OpInfo[OpNo].RegClass; 1687 return RI.getRegClass(RCID); 1688 } 1689 1690 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 1691 switch (MI.getOpcode()) { 1692 case AMDGPU::COPY: 1693 case AMDGPU::REG_SEQUENCE: 1694 case AMDGPU::PHI: 1695 case AMDGPU::INSERT_SUBREG: 1696 return RI.hasVGPRs(getOpRegClass(MI, 0)); 1697 default: 1698 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 1699 } 1700 } 1701 1702 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { 1703 MachineBasicBlock::iterator I = MI; 1704 MachineBasicBlock *MBB = MI->getParent(); 1705 MachineOperand &MO = MI->getOperand(OpIdx); 1706 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1707 unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; 1708 const TargetRegisterClass *RC = RI.getRegClass(RCID); 1709 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1710 if (MO.isReg()) 1711 Opcode = AMDGPU::COPY; 1712 else if (RI.isSGPRClass(RC)) 1713 Opcode = AMDGPU::S_MOV_B32; 1714 1715 1716 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 1717 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 1718 VRC = &AMDGPU::VReg_64RegClass; 1719 else 1720 VRC = &AMDGPU::VGPR_32RegClass; 1721 1722 unsigned Reg = MRI.createVirtualRegister(VRC); 1723 DebugLoc DL = MBB->findDebugLoc(I); 1724 BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) 1725 .addOperand(MO); 1726 MO.ChangeToRegister(Reg, false); 1727 } 1728 1729 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 1730 MachineRegisterInfo &MRI, 1731 MachineOperand &SuperReg, 1732 const TargetRegisterClass *SuperRC, 1733 unsigned SubIdx, 1734 const TargetRegisterClass *SubRC) 1735 const { 1736 MachineBasicBlock *MBB = MI->getParent(); 1737 DebugLoc DL = MI->getDebugLoc(); 1738 unsigned SubReg = MRI.createVirtualRegister(SubRC); 1739 1740 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 1741 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1742 .addReg(SuperReg.getReg(), 0, SubIdx); 1743 return SubReg; 1744 } 1745 1746 // Just in case the super register is itself a sub-register, copy it to a new 1747 // value so we don't need to worry about merging its subreg index with the 1748 // SubIdx passed to this function. The register coalescer should be able to 1749 // eliminate this extra copy. 1750 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 1751 1752 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 1753 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 1754 1755 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1756 .addReg(NewSuperReg, 0, SubIdx); 1757 1758 return SubReg; 1759 } 1760 1761 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 1762 MachineBasicBlock::iterator MII, 1763 MachineRegisterInfo &MRI, 1764 MachineOperand &Op, 1765 const TargetRegisterClass *SuperRC, 1766 unsigned SubIdx, 1767 const TargetRegisterClass *SubRC) const { 1768 if (Op.isImm()) { 1769 // XXX - Is there a better way to do this? 
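    // For example, a 64-bit immediate of 0x1234567800000005 splits into
    // 0x00000005 for sub0 (low 32 bits) and 0x12345678 for sub1 (high 32
    // bits).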
1770 if (SubIdx == AMDGPU::sub0) 1771 return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); 1772 if (SubIdx == AMDGPU::sub1) 1773 return MachineOperand::CreateImm(Op.getImm() >> 32); 1774 1775 llvm_unreachable("Unhandled register index for immediate"); 1776 } 1777 1778 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 1779 SubIdx, SubRC); 1780 return MachineOperand::CreateReg(SubReg, false); 1781 } 1782 1783 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 1784 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { 1785 assert(Inst->getNumExplicitOperands() == 3); 1786 MachineOperand Op1 = Inst->getOperand(1); 1787 Inst->RemoveOperand(1); 1788 Inst->addOperand(Op1); 1789 } 1790 1791 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 1792 const MCOperandInfo &OpInfo, 1793 const MachineOperand &MO) const { 1794 if (!MO.isReg()) 1795 return false; 1796 1797 unsigned Reg = MO.getReg(); 1798 const TargetRegisterClass *RC = 1799 TargetRegisterInfo::isVirtualRegister(Reg) ? 1800 MRI.getRegClass(Reg) : 1801 RI.getPhysRegClass(Reg); 1802 1803 const SIRegisterInfo *TRI = 1804 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1805 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 1806 1807 // In order to be legal, the common sub-class must be equal to the 1808 // class of the current operand. For example: 1809 // 1810 // v_mov_b32 s0 ; Operand defined as vsrc_32 1811 // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL 1812 // 1813 // s_sendmsg 0, s0 ; Operand defined as m0reg 1814 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 1815 1816 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 1817 } 1818 1819 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 1820 const MCOperandInfo &OpInfo, 1821 const MachineOperand &MO) const { 1822 if (MO.isReg()) 1823 return isLegalRegOperand(MRI, OpInfo, MO); 1824 1825 // Handle non-register types that are treated like immediates. 1826 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1827 return true; 1828 } 1829 1830 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, 1831 const MachineOperand *MO) const { 1832 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1833 const MCInstrDesc &InstDesc = MI->getDesc(); 1834 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 1835 const TargetRegisterClass *DefinedRC = 1836 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 1837 if (!MO) 1838 MO = &MI->getOperand(OpIdx); 1839 1840 if (isVALU(*MI) && 1841 usesConstantBus(MRI, *MO, DefinedRC->getSize())) { 1842 1843 RegSubRegPair SGPRUsed; 1844 if (MO->isReg()) 1845 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 1846 1847 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 1848 if (i == OpIdx) 1849 continue; 1850 const MachineOperand &Op = MI->getOperand(i); 1851 if (Op.isReg() && 1852 (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 1853 usesConstantBus(MRI, Op, getOpSize(*MI, i))) { 1854 return false; 1855 } 1856 } 1857 } 1858 1859 if (MO->isReg()) { 1860 assert(DefinedRC); 1861 return isLegalRegOperand(MRI, OpInfo, *MO); 1862 } 1863 1864 1865 // Handle non-register types that are treated like immediates. 1866 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 1867 1868 if (!DefinedRC) { 1869 // This operand expects an immediate. 
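    // With no register class to check against, any of the immediate-like
    // operand kinds asserted above is accepted as-is.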
1870 return true; 1871 } 1872 1873 return isImmOperandLegal(MI, OpIdx, *MO); 1874 } 1875 1876 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 1877 MachineInstr *MI) const { 1878 unsigned Opc = MI->getOpcode(); 1879 const MCInstrDesc &InstrDesc = get(Opc); 1880 1881 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1882 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1883 1884 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 1885 // we need to only have one constant bus use. 1886 // 1887 // Note we do not need to worry about literal constants here. They are 1888 // disabled for the operand type for instructions because they will always 1889 // violate the one constant bus use rule. 1890 bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; 1891 if (HasImplicitSGPR) { 1892 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1893 MachineOperand &Src0 = MI->getOperand(Src0Idx); 1894 1895 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 1896 legalizeOpWithMove(MI, Src0Idx); 1897 } 1898 1899 // VOP2 src0 instructions support all operand types, so we don't need to check 1900 // their legality. If src1 is already legal, we don't need to do anything. 1901 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 1902 return; 1903 1904 // We do not use commuteInstruction here because it is too aggressive and will 1905 // commute if it is possible. We only want to commute here if it improves 1906 // legality. This can be called a fairly large number of times so don't waste 1907 // compile time pointlessly swapping and checking legality again. 1908 if (HasImplicitSGPR || !MI->isCommutable()) { 1909 legalizeOpWithMove(MI, Src1Idx); 1910 return; 1911 } 1912 1913 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1914 MachineOperand &Src0 = MI->getOperand(Src0Idx); 1915 1916 // If src0 can be used as src1, commuting will make the operands legal. 1917 // Otherwise we have to give up and insert a move. 1918 // 1919 // TODO: Other immediate-like operand kinds could be commuted if there was a 1920 // MachineOperand::ChangeTo* for them. 1921 if ((!Src1.isImm() && !Src1.isReg()) || 1922 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 1923 legalizeOpWithMove(MI, Src1Idx); 1924 return; 1925 } 1926 1927 int CommutedOpc = commuteOpcode(*MI); 1928 if (CommutedOpc == -1) { 1929 legalizeOpWithMove(MI, Src1Idx); 1930 return; 1931 } 1932 1933 MI->setDesc(get(CommutedOpc)); 1934 1935 unsigned Src0Reg = Src0.getReg(); 1936 unsigned Src0SubReg = Src0.getSubReg(); 1937 bool Src0Kill = Src0.isKill(); 1938 1939 if (Src1.isImm()) 1940 Src0.ChangeToImmediate(Src1.getImm()); 1941 else if (Src1.isReg()) { 1942 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 1943 Src0.setSubReg(Src1.getSubReg()); 1944 } else 1945 llvm_unreachable("Should only have register or immediate operands"); 1946 1947 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 1948 Src1.setSubReg(Src0SubReg); 1949 } 1950 1951 // Legalize VOP3 operands. Because all operand types are supported for any 1952 // operand, and since literal constants are not allowed and should never be 1953 // seen, we only need to worry about inserting copies if we use multiple SGPR 1954 // operands. 
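//
// For example (mirroring the constant bus rule checked in findUsedSGPR):
//   v_fma_f32 v0, s0, s0, s0  -> already legal, only one unique SGPR is read
//   v_fma_f32 v0, s0, s1, s0  -> s1 must first be copied into a VGPR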
void SIInstrInfo::legalizeOperandsVOP3(
  MachineRegisterInfo &MRI,
  MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI->getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = VRC->getSize() / 4;

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
      .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
                                    UseMI->getDebugLoc(),
                                    get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr *MI) const {

  // If the pointer is stored in VGPRs, then we need to move it to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instructions, so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  // Legalize VOP2
  if (isVOP2(*MI) || isVOPC(*MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(*MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(*MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
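  // For example, a PHI that mixes SGPR and VGPR inputs must have all of its
  // inputs (and its result) rewritten to the equivalent VGPR class, since a
  // VGPR-to-SGPR copy is illegal.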
  if (MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be VGPRs,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI->getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction.
      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI->getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI->getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
        .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MIMG
  if (isMIMG(*MI)) {
    MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();

    // Extract the ptr from the resource descriptor.
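    // The 64-bit pointer lives in sub0_sub1 of the 128-bit descriptor. Pull
    // it out here so it can be folded into VADDR below, while the rebuilt
    // descriptor keeps a zero base address and the default data format.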
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
        &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
            Zero64)
      .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI->getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB
          = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
            .addOperand(*VData)
            .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2270 // This will be replaced later 2271 // with the new value of vaddr. 2272 .addOperand(*SRsrc) 2273 .addOperand(*SOffset) 2274 .addOperand(*Offset); 2275 2276 // Atomics do not have this operand. 2277 if (const MachineOperand *GLC 2278 = getNamedOperand(*MI, AMDGPU::OpName::glc)) { 2279 MIB.addImm(GLC->getImm()); 2280 } 2281 2282 MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); 2283 2284 if (const MachineOperand *TFE 2285 = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { 2286 MIB.addImm(TFE->getImm()); 2287 } 2288 2289 MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 2290 Addr64 = MIB; 2291 } else { 2292 // Atomics with return. 2293 Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) 2294 .addOperand(*VData) 2295 .addOperand(*VDataIn) 2296 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 2297 // This will be replaced later 2298 // with the new value of vaddr. 2299 .addOperand(*SRsrc) 2300 .addOperand(*SOffset) 2301 .addOperand(*Offset) 2302 .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) 2303 .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 2304 } 2305 2306 MI->removeFromParent(); 2307 MI = Addr64; 2308 2309 // NewVaddr = {NewVaddrHi, NewVaddrLo} 2310 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 2311 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 2312 .addImm(AMDGPU::sub0) 2313 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 2314 .addImm(AMDGPU::sub1); 2315 2316 VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); 2317 SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); 2318 } 2319 2320 // Update the instruction to use NewVaddr 2321 VAddr->setReg(NewVAddr); 2322 // Update the instruction to use NewSRsrc 2323 SRsrc->setReg(NewSRsrc); 2324 } 2325 } 2326 2327 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2328 SmallVector<MachineInstr *, 128> Worklist; 2329 Worklist.push_back(&TopInst); 2330 2331 while (!Worklist.empty()) { 2332 MachineInstr *Inst = Worklist.pop_back_val(); 2333 MachineBasicBlock *MBB = Inst->getParent(); 2334 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2335 2336 unsigned Opcode = Inst->getOpcode(); 2337 unsigned NewOpcode = getVALUOp(*Inst); 2338 2339 // Handle some special cases 2340 switch (Opcode) { 2341 default: 2342 break; 2343 case AMDGPU::S_AND_B64: 2344 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 2345 Inst->eraseFromParent(); 2346 continue; 2347 2348 case AMDGPU::S_OR_B64: 2349 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 2350 Inst->eraseFromParent(); 2351 continue; 2352 2353 case AMDGPU::S_XOR_B64: 2354 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 2355 Inst->eraseFromParent(); 2356 continue; 2357 2358 case AMDGPU::S_NOT_B64: 2359 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 2360 Inst->eraseFromParent(); 2361 continue; 2362 2363 case AMDGPU::S_BCNT1_I32_B64: 2364 splitScalar64BitBCNT(Worklist, Inst); 2365 Inst->eraseFromParent(); 2366 continue; 2367 2368 case AMDGPU::S_BFE_I64: { 2369 splitScalar64BitBFE(Worklist, Inst); 2370 Inst->eraseFromParent(); 2371 continue; 2372 } 2373 2374 case AMDGPU::S_LSHL_B32: 2375 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2376 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 2377 swapOperands(Inst); 2378 } 2379 break; 2380 case AMDGPU::S_ASHR_I32: 2381 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2382 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 2383 swapOperands(Inst); 2384 } 2385 break; 2386 case AMDGPU::S_LSHR_B32: 2387 if (ST.getGeneration() 
>= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2388 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 2389 swapOperands(Inst); 2390 } 2391 break; 2392 case AMDGPU::S_LSHL_B64: 2393 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2394 NewOpcode = AMDGPU::V_LSHLREV_B64; 2395 swapOperands(Inst); 2396 } 2397 break; 2398 case AMDGPU::S_ASHR_I64: 2399 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2400 NewOpcode = AMDGPU::V_ASHRREV_I64; 2401 swapOperands(Inst); 2402 } 2403 break; 2404 case AMDGPU::S_LSHR_B64: 2405 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2406 NewOpcode = AMDGPU::V_LSHRREV_B64; 2407 swapOperands(Inst); 2408 } 2409 break; 2410 2411 case AMDGPU::S_ABS_I32: 2412 lowerScalarAbs(Worklist, Inst); 2413 Inst->eraseFromParent(); 2414 continue; 2415 2416 case AMDGPU::S_CBRANCH_SCC0: 2417 case AMDGPU::S_CBRANCH_SCC1: 2418 // Clear unused bits of vcc 2419 BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC) 2420 .addReg(AMDGPU::EXEC) 2421 .addReg(AMDGPU::VCC); 2422 break; 2423 2424 case AMDGPU::S_BFE_U64: 2425 case AMDGPU::S_BFM_B64: 2426 llvm_unreachable("Moving this op to VALU not implemented"); 2427 } 2428 2429 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 2430 // We cannot move this instruction to the VALU, so we should try to 2431 // legalize its operands instead. 2432 legalizeOperands(Inst); 2433 continue; 2434 } 2435 2436 // Use the new VALU Opcode. 2437 const MCInstrDesc &NewDesc = get(NewOpcode); 2438 Inst->setDesc(NewDesc); 2439 2440 // Remove any references to SCC. Vector instructions can't read from it, and 2441 // We're just about to add the implicit use / defs of VCC, and we don't want 2442 // both. 2443 for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { 2444 MachineOperand &Op = Inst->getOperand(i); 2445 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 2446 Inst->RemoveOperand(i); 2447 addSCCDefUsersToVALUWorklist(Inst, Worklist); 2448 } 2449 } 2450 2451 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2452 // We are converting these to a BFE, so we need to add the missing 2453 // operands for the size and offset. 2454 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2455 Inst->addOperand(MachineOperand::CreateImm(0)); 2456 Inst->addOperand(MachineOperand::CreateImm(Size)); 2457 2458 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2459 // The VALU version adds the second operand to the result, so insert an 2460 // extra 0 operand. 2461 Inst->addOperand(MachineOperand::CreateImm(0)); 2462 } 2463 2464 Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); 2465 2466 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2467 const MachineOperand &OffsetWidthOp = Inst->getOperand(2); 2468 // If we need to move this to VGPRs, we need to unpack the second operand 2469 // back into the 2 separate ones for bit offset and width. 2470 assert(OffsetWidthOp.isImm() && 2471 "Scalar BFE is only implemented for constant width and offset"); 2472 uint32_t Imm = OffsetWidthOp.getImm(); 2473 2474 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2475 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2476 Inst->RemoveOperand(2); // Remove old immediate. 
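      // For example, an immediate of 0x100010 encodes offset 16 and width 16,
      // which become the two separate operands added below.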
2477 Inst->addOperand(MachineOperand::CreateImm(Offset)); 2478 Inst->addOperand(MachineOperand::CreateImm(BitWidth)); 2479 } 2480 2481 bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef(); 2482 unsigned NewDstReg = AMDGPU::NoRegister; 2483 if (HasDst) { 2484 // Update the destination register class. 2485 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); 2486 if (!NewDstRC) 2487 continue; 2488 2489 unsigned DstReg = Inst->getOperand(0).getReg(); 2490 NewDstReg = MRI.createVirtualRegister(NewDstRC); 2491 MRI.replaceRegWith(DstReg, NewDstReg); 2492 } 2493 2494 // Legalize the operands 2495 legalizeOperands(Inst); 2496 2497 if (HasDst) 2498 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 2499 } 2500 } 2501 2502 //===----------------------------------------------------------------------===// 2503 // Indirect addressing callbacks 2504 //===----------------------------------------------------------------------===// 2505 2506 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { 2507 return &AMDGPU::VGPR_32RegClass; 2508 } 2509 2510 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 2511 MachineInstr *Inst) const { 2512 MachineBasicBlock &MBB = *Inst->getParent(); 2513 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2514 MachineBasicBlock::iterator MII = Inst; 2515 DebugLoc DL = Inst->getDebugLoc(); 2516 2517 MachineOperand &Dest = Inst->getOperand(0); 2518 MachineOperand &Src = Inst->getOperand(1); 2519 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2520 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2521 2522 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 2523 .addImm(0) 2524 .addReg(Src.getReg()); 2525 2526 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 2527 .addReg(Src.getReg()) 2528 .addReg(TmpReg); 2529 2530 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2531 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2532 } 2533 2534 void SIInstrInfo::splitScalar64BitUnaryOp( 2535 SmallVectorImpl<MachineInstr *> &Worklist, 2536 MachineInstr *Inst, 2537 unsigned Opcode) const { 2538 MachineBasicBlock &MBB = *Inst->getParent(); 2539 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2540 2541 MachineOperand &Dest = Inst->getOperand(0); 2542 MachineOperand &Src0 = Inst->getOperand(1); 2543 DebugLoc DL = Inst->getDebugLoc(); 2544 2545 MachineBasicBlock::iterator MII = Inst; 2546 2547 const MCInstrDesc &InstDesc = get(Opcode); 2548 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
2549 MRI.getRegClass(Src0.getReg()) : 2550 &AMDGPU::SGPR_32RegClass; 2551 2552 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2553 2554 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2555 AMDGPU::sub0, Src0SubRC); 2556 2557 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2558 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 2559 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 2560 2561 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 2562 BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2563 .addOperand(SrcReg0Sub0); 2564 2565 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2566 AMDGPU::sub1, Src0SubRC); 2567 2568 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 2569 BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2570 .addOperand(SrcReg0Sub1); 2571 2572 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 2573 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2574 .addReg(DestSub0) 2575 .addImm(AMDGPU::sub0) 2576 .addReg(DestSub1) 2577 .addImm(AMDGPU::sub1); 2578 2579 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2580 2581 // We don't need to legalizeOperands here because for a single operand, src0 2582 // will support any kind of input. 2583 2584 // Move all users of this moved value. 2585 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 2586 } 2587 2588 void SIInstrInfo::splitScalar64BitBinaryOp( 2589 SmallVectorImpl<MachineInstr *> &Worklist, 2590 MachineInstr *Inst, 2591 unsigned Opcode) const { 2592 MachineBasicBlock &MBB = *Inst->getParent(); 2593 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2594 2595 MachineOperand &Dest = Inst->getOperand(0); 2596 MachineOperand &Src0 = Inst->getOperand(1); 2597 MachineOperand &Src1 = Inst->getOperand(2); 2598 DebugLoc DL = Inst->getDebugLoc(); 2599 2600 MachineBasicBlock::iterator MII = Inst; 2601 2602 const MCInstrDesc &InstDesc = get(Opcode); 2603 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2604 MRI.getRegClass(Src0.getReg()) : 2605 &AMDGPU::SGPR_32RegClass; 2606 2607 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2608 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0)
    .addOperand(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1)
    .addOperand(SrcReg1Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                                       MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  BuildMI(MBB, MII, DL, InstDesc, MidReg)
    .addOperand(SrcRegSub0)
    .addImm(0);

  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
    .addOperand(SrcRegSub1)
    .addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
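  // V_BCNT_U32_B32 adds the population count of its first operand to its
  // second operand, so feeding MidReg into the second bcnt sums the counts of
  // the low and high halves.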
2694 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2695 } 2696 2697 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, 2698 MachineInstr *Inst) const { 2699 MachineBasicBlock &MBB = *Inst->getParent(); 2700 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2701 MachineBasicBlock::iterator MII = Inst; 2702 DebugLoc DL = Inst->getDebugLoc(); 2703 2704 MachineOperand &Dest = Inst->getOperand(0); 2705 uint32_t Imm = Inst->getOperand(2).getImm(); 2706 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2707 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2708 2709 (void) Offset; 2710 2711 // Only sext_inreg cases handled. 2712 assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && 2713 BitWidth <= 32 && 2714 Offset == 0 && 2715 "Not implemented"); 2716 2717 if (BitWidth < 32) { 2718 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2719 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2720 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2721 2722 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 2723 .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) 2724 .addImm(0) 2725 .addImm(BitWidth); 2726 2727 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 2728 .addImm(31) 2729 .addReg(MidRegLo); 2730 2731 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 2732 .addReg(MidRegLo) 2733 .addImm(AMDGPU::sub0) 2734 .addReg(MidRegHi) 2735 .addImm(AMDGPU::sub1); 2736 2737 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2738 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2739 return; 2740 } 2741 2742 MachineOperand &Src = Inst->getOperand(1); 2743 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2744 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2745 2746 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 2747 .addImm(31) 2748 .addReg(Src.getReg(), 0, AMDGPU::sub0); 2749 2750 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 2751 .addReg(Src.getReg(), 0, AMDGPU::sub0) 2752 .addImm(AMDGPU::sub0) 2753 .addReg(TmpReg) 2754 .addImm(AMDGPU::sub1); 2755 2756 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2757 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2758 } 2759 2760 void SIInstrInfo::addUsersToMoveToVALUWorklist( 2761 unsigned DstReg, 2762 MachineRegisterInfo &MRI, 2763 SmallVectorImpl<MachineInstr *> &Worklist) const { 2764 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 2765 E = MRI.use_end(); I != E; ++I) { 2766 MachineInstr &UseMI = *I->getParent(); 2767 if (!canReadVGPR(UseMI, I.getOperandNo())) { 2768 Worklist.push_back(&UseMI); 2769 } 2770 } 2771 } 2772 2773 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst, 2774 SmallVectorImpl<MachineInstr *> &Worklist) const { 2775 // This assumes that all the users of SCC are in the same block 2776 // as the SCC def. 2777 for (MachineBasicBlock::iterator I = SCCDefInst, 2778 E = SCCDefInst->getParent()->end(); I != E; ++I) { 2779 2780 // Exit if we find another SCC def. 
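    // Any SCC uses past another def read that new definition rather than
    // SCCDefInst, so they do not need to be added to the worklist.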
2781 if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1) 2782 return; 2783 2784 if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1) 2785 Worklist.push_back(I); 2786 } 2787 } 2788 2789 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 2790 const MachineInstr &Inst) const { 2791 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 2792 2793 switch (Inst.getOpcode()) { 2794 // For target instructions, getOpRegClass just returns the virtual register 2795 // class associated with the operand, so we need to find an equivalent VGPR 2796 // register class in order to move the instruction to the VALU. 2797 case AMDGPU::COPY: 2798 case AMDGPU::PHI: 2799 case AMDGPU::REG_SEQUENCE: 2800 case AMDGPU::INSERT_SUBREG: 2801 if (RI.hasVGPRs(NewDstRC)) 2802 return nullptr; 2803 2804 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 2805 if (!NewDstRC) 2806 return nullptr; 2807 return NewDstRC; 2808 default: 2809 return NewDstRC; 2810 } 2811 } 2812 2813 // Find the one SGPR operand we are allowed to use. 2814 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, 2815 int OpIndices[3]) const { 2816 const MCInstrDesc &Desc = MI->getDesc(); 2817 2818 // Find the one SGPR operand we are allowed to use. 2819 // 2820 // First we need to consider the instruction's operand requirements before 2821 // legalizing. Some operands are required to be SGPRs, such as implicit uses 2822 // of VCC, but we are still bound by the constant bus requirement to only use 2823 // one. 2824 // 2825 // If the operand's class is an SGPR, we can never move it. 2826 2827 unsigned SGPRReg = findImplicitSGPRRead(*MI); 2828 if (SGPRReg != AMDGPU::NoRegister) 2829 return SGPRReg; 2830 2831 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2832 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2833 2834 for (unsigned i = 0; i < 3; ++i) { 2835 int Idx = OpIndices[i]; 2836 if (Idx == -1) 2837 break; 2838 2839 const MachineOperand &MO = MI->getOperand(Idx); 2840 if (!MO.isReg()) 2841 continue; 2842 2843 // Is this operand statically required to be an SGPR based on the operand 2844 // constraints? 2845 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 2846 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 2847 if (IsRequiredSGPR) 2848 return MO.getReg(); 2849 2850 // If this could be a VGPR or an SGPR, Check the dynamic register class. 2851 unsigned Reg = MO.getReg(); 2852 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 2853 if (RI.isSGPRClass(RegRC)) 2854 UsedSGPRs[i] = Reg; 2855 } 2856 2857 // We don't have a required SGPR operand, so we have a bit more freedom in 2858 // selecting operands to move. 2859 2860 // Try to select the most used SGPR. If an SGPR is equal to one of the 2861 // others, we choose that. 2862 // 2863 // e.g. 2864 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2865 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2866 2867 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 2868 // prefer those. 
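  // If no SGPR value is repeated we return AMDGPU::NoRegister, and a caller
  // such as legalizeOperandsVOP3 keeps the first SGPR operand it encounters
  // while moving the rest.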
2869 2870 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2871 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2872 SGPRReg = UsedSGPRs[0]; 2873 } 2874 2875 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2876 if (UsedSGPRs[1] == UsedSGPRs[2]) 2877 SGPRReg = UsedSGPRs[1]; 2878 } 2879 2880 return SGPRReg; 2881 } 2882 2883 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2884 const MachineFunction &MF) const { 2885 int End = getIndirectIndexEnd(MF); 2886 int Begin = getIndirectIndexBegin(MF); 2887 2888 if (End == -1) 2889 return; 2890 2891 2892 for (int Index = Begin; Index <= End; ++Index) 2893 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2894 2895 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2896 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2897 2898 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2899 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2900 2901 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2902 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2903 2904 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2905 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2906 2907 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2908 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2909 } 2910 2911 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2912 unsigned OperandName) const { 2913 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2914 if (Idx == -1) 2915 return nullptr; 2916 2917 return &MI.getOperand(Idx); 2918 } 2919 2920 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2921 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2922 if (ST.isAmdHsaOS()) { 2923 RsrcDataFormat |= (1ULL << 56); 2924 2925 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2926 // Set MTYPE = 2 2927 RsrcDataFormat |= (2ULL << 59); 2928 } 2929 2930 return RsrcDataFormat; 2931 } 2932 2933 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 2934 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 2935 AMDGPU::RSRC_TID_ENABLE | 2936 0xffffffff; // Size; 2937 2938 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 2939 2940 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); 2941 2942 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 2943 // Clear them unless we want a huge stride. 2944 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2945 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 2946 2947 return Rsrc23; 2948 } 2949 2950 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { 2951 unsigned Opc = MI->getOpcode(); 2952 2953 return isSMRD(Opc); 2954 } 2955 2956 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { 2957 unsigned Opc = MI->getOpcode(); 2958 2959 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 2960 } 2961 2962 ArrayRef<std::pair<int, const char *>> 2963 SIInstrInfo::getSerializableTargetIndices() const { 2964 static const std::pair<int, const char *> TargetIndices[] = { 2965 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 2966 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 2967 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 2968 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 2969 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 2970 return makeArrayRef(TargetIndices); 2971 } 2972