1 //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "SIInstrInfo.h" 17 #include "AMDGPUTargetMachine.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineInstrBuilder.h" 22 #include "llvm/CodeGen/MachineRegisterInfo.h" 23 #include "llvm/IR/Function.h" 24 #include "llvm/CodeGen/RegisterScavenging.h" 25 #include "llvm/MC/MCInstrDesc.h" 26 #include "llvm/Support/Debug.h" 27 28 using namespace llvm; 29 30 SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) 31 : AMDGPUInstrInfo(st), RI() {} 32 33 //===----------------------------------------------------------------------===// 34 // TargetInstrInfo callbacks 35 //===----------------------------------------------------------------------===// 36 37 static unsigned getNumOperandsNoGlue(SDNode *Node) { 38 unsigned N = Node->getNumOperands(); 39 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 40 --N; 41 return N; 42 } 43 44 static SDValue findChainOperand(SDNode *Load) { 45 SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); 46 assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); 47 return LastOp; 48 } 49 50 /// \brief Returns true if both nodes have the same value for the given 51 /// operand \p Op, or if both nodes do not have this operand. 52 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 53 unsigned Opc0 = N0->getMachineOpcode(); 54 unsigned Opc1 = N1->getMachineOpcode(); 55 56 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 57 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 58 59 if (Op0Idx == -1 && Op1Idx == -1) 60 return true; 61 62 63 if ((Op0Idx == -1 && Op1Idx != -1) || 64 (Op1Idx == -1 && Op0Idx != -1)) 65 return false; 66 67 // getNamedOperandIdx returns the index for the MachineInstr's operands, 68 // which includes the result as the first operand. We are indexing into the 69 // MachineSDNode's operands, so we need to skip the result operand to get 70 // the real index. 71 --Op0Idx; 72 --Op1Idx; 73 74 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 75 } 76 77 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, 78 AliasAnalysis *AA) const { 79 // TODO: The generic check fails for VALU instructions that should be 80 // rematerializable due to implicit reads of exec. We really want all of the 81 // generic logic for this except for this. 82 switch (MI->getOpcode()) { 83 case AMDGPU::V_MOV_B32_e32: 84 case AMDGPU::V_MOV_B32_e64: 85 case AMDGPU::V_MOV_B64_PSEUDO: 86 return true; 87 default: 88 return false; 89 } 90 } 91 92 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 93 int64_t &Offset0, 94 int64_t &Offset1) const { 95 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 96 return false; 97 98 unsigned Opc0 = Load0->getMachineOpcode(); 99 unsigned Opc1 = Load1->getMachineOpcode(); 100 101 // Make sure both are actually loads. 
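  // Only DS, SMRD, and MUBUF/MTBUF pairs are handled below; any other
  // combination conservatively reports no common base pointer.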
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
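    // If either offset is not a plain constant (e.g. it is a frame index),
    // the byte offsets cannot be compared here, so conservatively fail.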
181 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 182 return false; 183 184 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 185 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 186 return true; 187 } 188 189 return false; 190 } 191 192 static bool isStride64(unsigned Opc) { 193 switch (Opc) { 194 case AMDGPU::DS_READ2ST64_B32: 195 case AMDGPU::DS_READ2ST64_B64: 196 case AMDGPU::DS_WRITE2ST64_B32: 197 case AMDGPU::DS_WRITE2ST64_B64: 198 return true; 199 default: 200 return false; 201 } 202 } 203 204 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, 205 int64_t &Offset, 206 const TargetRegisterInfo *TRI) const { 207 unsigned Opc = LdSt->getOpcode(); 208 209 if (isDS(*LdSt)) { 210 const MachineOperand *OffsetImm = getNamedOperand(*LdSt, 211 AMDGPU::OpName::offset); 212 if (OffsetImm) { 213 // Normal, single offset LDS instruction. 214 const MachineOperand *AddrReg = getNamedOperand(*LdSt, 215 AMDGPU::OpName::addr); 216 217 BaseReg = AddrReg->getReg(); 218 Offset = OffsetImm->getImm(); 219 return true; 220 } 221 222 // The 2 offset instructions use offset0 and offset1 instead. We can treat 223 // these as a load with a single offset if the 2 offsets are consecutive. We 224 // will use this for some partially aligned loads. 225 const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, 226 AMDGPU::OpName::offset0); 227 // DS_PERMUTE does not have Offset0Imm (and Offset1Imm). 228 if (!Offset0Imm) 229 return false; 230 231 const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, 232 AMDGPU::OpName::offset1); 233 234 uint8_t Offset0 = Offset0Imm->getImm(); 235 uint8_t Offset1 = Offset1Imm->getImm(); 236 237 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 238 // Each of these offsets is in element sized units, so we need to convert 239 // to bytes of the individual reads. 
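      // For example, a DS_READ2_B32 writes a 64-bit register pair, so each
      // element is 8 / 2 = 4 bytes and offset0 = 2 corresponds to byte
      // offset 8.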

      unsigned EltSize;
      if (LdSt->mayLoad())
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
  }

  if ((isMUBUF(*FirstLdSt) && isMUBUF(*SecondLdSt)) ||
      (isMTBUF(*FirstLdSt) && isMTBUF(*SecondLdSt))) {
    FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
    SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt->getParent()->getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());

  return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}

void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
346 assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); 347 348 static const int16_t Sub0_15[] = { 349 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 350 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 351 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 352 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 353 }; 354 355 static const int16_t Sub0_15_64[] = { 356 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 357 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 358 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 359 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 360 }; 361 362 static const int16_t Sub0_7[] = { 363 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 364 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 365 }; 366 367 static const int16_t Sub0_7_64[] = { 368 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 369 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 370 }; 371 372 static const int16_t Sub0_3[] = { 373 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 374 }; 375 376 static const int16_t Sub0_3_64[] = { 377 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 378 }; 379 380 static const int16_t Sub0_2[] = { 381 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 382 }; 383 384 static const int16_t Sub0_1[] = { 385 AMDGPU::sub0, AMDGPU::sub1, 386 }; 387 388 unsigned Opcode; 389 ArrayRef<int16_t> SubIndices; 390 bool Forward; 391 392 if (AMDGPU::SReg_32RegClass.contains(DestReg)) { 393 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 394 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 395 .addReg(SrcReg, getKillRegState(KillSrc)); 396 return; 397 398 } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { 399 if (DestReg == AMDGPU::VCC) { 400 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 401 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 402 .addReg(SrcReg, getKillRegState(KillSrc)); 403 } else { 404 // FIXME: Hack until VReg_1 removed. 
405 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 406 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) 407 .addImm(0) 408 .addReg(SrcReg, getKillRegState(KillSrc)); 409 } 410 411 return; 412 } 413 414 assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); 415 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 416 .addReg(SrcReg, getKillRegState(KillSrc)); 417 return; 418 419 } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { 420 assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); 421 Opcode = AMDGPU::S_MOV_B64; 422 SubIndices = Sub0_3_64; 423 424 } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { 425 assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); 426 Opcode = AMDGPU::S_MOV_B64; 427 SubIndices = Sub0_7_64; 428 429 } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { 430 assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); 431 Opcode = AMDGPU::S_MOV_B64; 432 SubIndices = Sub0_15_64; 433 434 } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { 435 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 436 AMDGPU::SReg_32RegClass.contains(SrcReg)); 437 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 438 .addReg(SrcReg, getKillRegState(KillSrc)); 439 return; 440 441 } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { 442 assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || 443 AMDGPU::SReg_64RegClass.contains(SrcReg)); 444 Opcode = AMDGPU::V_MOV_B32_e32; 445 SubIndices = Sub0_1; 446 447 } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { 448 assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); 449 Opcode = AMDGPU::V_MOV_B32_e32; 450 SubIndices = Sub0_2; 451 452 } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { 453 assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || 454 AMDGPU::SReg_128RegClass.contains(SrcReg)); 455 Opcode = AMDGPU::V_MOV_B32_e32; 456 SubIndices = Sub0_3; 457 458 } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { 459 assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || 460 AMDGPU::SReg_256RegClass.contains(SrcReg)); 461 Opcode = AMDGPU::V_MOV_B32_e32; 462 SubIndices = Sub0_7; 463 464 } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { 465 assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || 466 AMDGPU::SReg_512RegClass.contains(SrcReg)); 467 Opcode = AMDGPU::V_MOV_B32_e32; 468 SubIndices = Sub0_15; 469 470 } else { 471 llvm_unreachable("Can't copy register!"); 472 } 473 474 if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) 475 Forward = true; 476 else 477 Forward = false; 478 479 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 480 unsigned SubIdx; 481 if (Forward) 482 SubIdx = SubIndices[Idx]; 483 else 484 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 485 486 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 487 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 488 489 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 490 491 if (Idx == SubIndices.size() - 1) 492 Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); 493 494 if (Idx == 0) 495 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 496 } 497 } 498 499 int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { 500 const unsigned Opcode = MI.getOpcode(); 501 502 int NewOpc; 503 504 // Try to map original to commuted opcode 505 NewOpc = AMDGPU::getCommuteRev(Opcode); 506 if (NewOpc != -1) 507 // Check if the commuted (REV) opcode exists on the target. 508 return pseudoToMCOpcode(NewOpc) != -1 ? 
NewOpc : -1; 509 510 // Try to map commuted to original opcode 511 NewOpc = AMDGPU::getCommuteOrig(Opcode); 512 if (NewOpc != -1) 513 // Check if the original (non-REV) opcode exists on the target. 514 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 515 516 return Opcode; 517 } 518 519 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 520 521 if (DstRC->getSize() == 4) { 522 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 523 } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { 524 return AMDGPU::S_MOV_B64; 525 } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { 526 return AMDGPU::V_MOV_B64_PSEUDO; 527 } 528 return AMDGPU::COPY; 529 } 530 531 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 532 switch (Size) { 533 case 4: 534 return AMDGPU::SI_SPILL_S32_SAVE; 535 case 8: 536 return AMDGPU::SI_SPILL_S64_SAVE; 537 case 16: 538 return AMDGPU::SI_SPILL_S128_SAVE; 539 case 32: 540 return AMDGPU::SI_SPILL_S256_SAVE; 541 case 64: 542 return AMDGPU::SI_SPILL_S512_SAVE; 543 default: 544 llvm_unreachable("unknown register size"); 545 } 546 } 547 548 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 549 switch (Size) { 550 case 4: 551 return AMDGPU::SI_SPILL_V32_SAVE; 552 case 8: 553 return AMDGPU::SI_SPILL_V64_SAVE; 554 case 16: 555 return AMDGPU::SI_SPILL_V128_SAVE; 556 case 32: 557 return AMDGPU::SI_SPILL_V256_SAVE; 558 case 64: 559 return AMDGPU::SI_SPILL_V512_SAVE; 560 default: 561 llvm_unreachable("unknown register size"); 562 } 563 } 564 565 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 566 MachineBasicBlock::iterator MI, 567 unsigned SrcReg, bool isKill, 568 int FrameIndex, 569 const TargetRegisterClass *RC, 570 const TargetRegisterInfo *TRI) const { 571 MachineFunction *MF = MBB.getParent(); 572 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 573 MachineFrameInfo *FrameInfo = MF->getFrameInfo(); 574 DebugLoc DL = MBB.findDebugLoc(MI); 575 576 unsigned Size = FrameInfo->getObjectSize(FrameIndex); 577 unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); 578 MachinePointerInfo PtrInfo 579 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 580 MachineMemOperand *MMO 581 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 582 Size, Align); 583 584 if (RI.isSGPRClass(RC)) { 585 MFI->setHasSpilledSGPRs(); 586 587 // We are only allowed to create one new instruction when spilling 588 // registers, so we need to use pseudo instruction for spilling 589 // SGPRs. 
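    // The SI_SPILL_S*_SAVE pseudo below is selected purely by spill size,
    // e.g. a 16-byte SReg_128 spill uses SI_SPILL_S128_SAVE.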
590 unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); 591 BuildMI(MBB, MI, DL, get(Opcode)) 592 .addReg(SrcReg) // src 593 .addFrameIndex(FrameIndex) // frame_idx 594 .addMemOperand(MMO); 595 596 return; 597 } 598 599 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 600 LLVMContext &Ctx = MF->getFunction()->getContext(); 601 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 602 " spill register"); 603 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 604 .addReg(SrcReg); 605 606 return; 607 } 608 609 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 610 611 unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); 612 MFI->setHasSpilledVGPRs(); 613 BuildMI(MBB, MI, DL, get(Opcode)) 614 .addReg(SrcReg) // src 615 .addFrameIndex(FrameIndex) // frame_idx 616 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 617 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 618 .addImm(0) // offset 619 .addMemOperand(MMO); 620 } 621 622 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 623 switch (Size) { 624 case 4: 625 return AMDGPU::SI_SPILL_S32_RESTORE; 626 case 8: 627 return AMDGPU::SI_SPILL_S64_RESTORE; 628 case 16: 629 return AMDGPU::SI_SPILL_S128_RESTORE; 630 case 32: 631 return AMDGPU::SI_SPILL_S256_RESTORE; 632 case 64: 633 return AMDGPU::SI_SPILL_S512_RESTORE; 634 default: 635 llvm_unreachable("unknown register size"); 636 } 637 } 638 639 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 640 switch (Size) { 641 case 4: 642 return AMDGPU::SI_SPILL_V32_RESTORE; 643 case 8: 644 return AMDGPU::SI_SPILL_V64_RESTORE; 645 case 16: 646 return AMDGPU::SI_SPILL_V128_RESTORE; 647 case 32: 648 return AMDGPU::SI_SPILL_V256_RESTORE; 649 case 64: 650 return AMDGPU::SI_SPILL_V512_RESTORE; 651 default: 652 llvm_unreachable("unknown register size"); 653 } 654 } 655 656 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 657 MachineBasicBlock::iterator MI, 658 unsigned DestReg, int FrameIndex, 659 const TargetRegisterClass *RC, 660 const TargetRegisterInfo *TRI) const { 661 MachineFunction *MF = MBB.getParent(); 662 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 663 MachineFrameInfo *FrameInfo = MF->getFrameInfo(); 664 DebugLoc DL = MBB.findDebugLoc(MI); 665 unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); 666 unsigned Size = FrameInfo->getObjectSize(FrameIndex); 667 668 MachinePointerInfo PtrInfo 669 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 670 671 MachineMemOperand *MMO = MF->getMachineMemOperand( 672 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 673 674 if (RI.isSGPRClass(RC)) { 675 // FIXME: Maybe this should not include a memoperand because it will be 676 // lowered to non-memory instructions. 
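    // As with the save path, the restore pseudo is chosen by size alone,
    // e.g. an 8-byte SReg_64 reload uses SI_SPILL_S64_RESTORE.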
677 unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); 678 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 679 .addFrameIndex(FrameIndex) // frame_idx 680 .addMemOperand(MMO); 681 682 return; 683 } 684 685 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 686 LLVMContext &Ctx = MF->getFunction()->getContext(); 687 Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" 688 " restore register"); 689 BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); 690 691 return; 692 } 693 694 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 695 696 unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); 697 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 698 .addFrameIndex(FrameIndex) // frame_idx 699 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 700 .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset 701 .addImm(0) // offset 702 .addMemOperand(MMO); 703 } 704 705 /// \param @Offset Offset in bytes of the FrameIndex being spilled 706 unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, 707 MachineBasicBlock::iterator MI, 708 RegScavenger *RS, unsigned TmpReg, 709 unsigned FrameOffset, 710 unsigned Size) const { 711 MachineFunction *MF = MBB.getParent(); 712 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 713 const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); 714 const SIRegisterInfo *TRI = 715 static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); 716 DebugLoc DL = MBB.findDebugLoc(MI); 717 unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); 718 unsigned WavefrontSize = ST.getWavefrontSize(); 719 720 unsigned TIDReg = MFI->getTIDReg(); 721 if (!MFI->hasCalculatedTID()) { 722 MachineBasicBlock &Entry = MBB.getParent()->front(); 723 MachineBasicBlock::iterator Insert = Entry.front(); 724 DebugLoc DL = Insert->getDebugLoc(); 725 726 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); 727 if (TIDReg == AMDGPU::NoRegister) 728 return TIDReg; 729 730 731 if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && 732 WorkGroupSize > WavefrontSize) { 733 734 unsigned TIDIGXReg 735 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); 736 unsigned TIDIGYReg 737 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); 738 unsigned TIDIGZReg 739 = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); 740 unsigned InputPtrReg = 741 TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 742 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 743 if (!Entry.isLiveIn(Reg)) 744 Entry.addLiveIn(Reg); 745 } 746 747 RS->enterBasicBlock(Entry); 748 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
          .addReg(InputPtrReg)
          .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
          .addReg(InputPtrReg)
          .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
          .addReg(STmp1)
          .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
          .addReg(STmp1)
          .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
          .addReg(STmp0)
          .addReg(TIDIGYReg)
          .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
          .addReg(TIDReg)
          .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), TIDReg)
          .addImm(-1)
          .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), TIDReg)
          .addImm(-1)
          .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), TIDReg)
        .addImm(2)
        .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
      .addImm(LDSOffset)
      .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
        .addImm(Arg);
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
          .addImm(Imm.getLoBits(32).getZExtValue())
          .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
          .addImm(Imm.getHiBits(32).getZExtValue())
          .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
          .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
          .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
        .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
        .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
        .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
        .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
        .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
        .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::SI_CONSTDATA_PTR: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .addOperand(MI->getOperand(1)));
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                       .addReg(RegHi)
                       .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for a
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
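/// For example, a VOP2 whose src0 is an SGPR typically cannot be commuted,
/// because src0 would have to move into the src1 slot, which only accepts
/// VGPRs.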
917 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, 918 bool NewMI, 919 unsigned OpIdx0, 920 unsigned OpIdx1) const { 921 int CommutedOpcode = commuteOpcode(*MI); 922 if (CommutedOpcode == -1) 923 return nullptr; 924 925 int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 926 AMDGPU::OpName::src0); 927 MachineOperand &Src0 = MI->getOperand(Src0Idx); 928 if (!Src0.isReg()) 929 return nullptr; 930 931 int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 932 AMDGPU::OpName::src1); 933 934 if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || 935 OpIdx1 != static_cast<unsigned>(Src1Idx)) && 936 (OpIdx0 != static_cast<unsigned>(Src1Idx) || 937 OpIdx1 != static_cast<unsigned>(Src0Idx))) 938 return nullptr; 939 940 MachineOperand &Src1 = MI->getOperand(Src1Idx); 941 942 943 if (isVOP2(*MI)) { 944 const MCInstrDesc &InstrDesc = MI->getDesc(); 945 // For VOP2 instructions, any operand type is valid to use for src0. Make 946 // sure we can use the src1 as src0. 947 // 948 // We could be stricter here and only allow commuting if there is a reason 949 // to do so. i.e. if both operands are VGPRs there is no real benefit, 950 // although MachineCSE attempts to find matches by commuting. 951 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 952 if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) 953 return nullptr; 954 } 955 956 if (!Src1.isReg()) { 957 // Allow commuting instructions with Imm operands. 958 if (NewMI || !Src1.isImm() || 959 (!isVOP2(*MI) && !isVOP3(*MI))) { 960 return nullptr; 961 } 962 // Be sure to copy the source modifiers to the right place. 963 if (MachineOperand *Src0Mods 964 = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 965 MachineOperand *Src1Mods 966 = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); 967 968 int Src0ModsVal = Src0Mods->getImm(); 969 if (!Src1Mods && Src0ModsVal != 0) 970 return nullptr; 971 972 // XXX - This assert might be a lie. It might be useful to have a neg 973 // modifier with 0.0. 974 int Src1ModsVal = Src1Mods->getImm(); 975 assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); 976 977 Src1Mods->setImm(Src0ModsVal); 978 Src0Mods->setImm(Src1ModsVal); 979 } 980 981 unsigned Reg = Src0.getReg(); 982 unsigned SubReg = Src0.getSubReg(); 983 if (Src1.isImm()) 984 Src0.ChangeToImmediate(Src1.getImm()); 985 else 986 llvm_unreachable("Should only have immediates"); 987 988 Src1.ChangeToRegister(Reg, false); 989 Src1.setSubReg(SubReg); 990 } else { 991 MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); 992 } 993 994 if (MI) 995 MI->setDesc(get(CommutedOpcode)); 996 997 return MI; 998 } 999 1000 // This needs to be implemented because the source modifiers may be inserted 1001 // between the true commutable operands, and the base 1002 // TargetInstrInfo::commuteInstruction uses it. 1003 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, 1004 unsigned &SrcOpIdx0, 1005 unsigned &SrcOpIdx1) const { 1006 const MCInstrDesc &MCID = MI->getDesc(); 1007 if (!MCID.isCommutable()) 1008 return false; 1009 1010 unsigned Opc = MI->getOpcode(); 1011 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1012 if (Src0Idx == -1) 1013 return false; 1014 1015 // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on 1016 // immediate. 
Also, immediate src0 operand is not handled in 1017 // SIInstrInfo::commuteInstruction(); 1018 if (!MI->getOperand(Src0Idx).isReg()) 1019 return false; 1020 1021 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1022 if (Src1Idx == -1) 1023 return false; 1024 1025 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1026 if (Src1.isImm()) { 1027 // SIInstrInfo::commuteInstruction() does support commuting the immediate 1028 // operand src1 in 2 and 3 operand instructions. 1029 if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) 1030 return false; 1031 } else if (Src1.isReg()) { 1032 // If any source modifiers are set, the generic instruction commuting won't 1033 // understand how to copy the source modifiers. 1034 if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || 1035 hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) 1036 return false; 1037 } else 1038 return false; 1039 1040 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1041 } 1042 1043 static void removeModOperands(MachineInstr &MI) { 1044 unsigned Opc = MI.getOpcode(); 1045 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1046 AMDGPU::OpName::src0_modifiers); 1047 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1048 AMDGPU::OpName::src1_modifiers); 1049 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1050 AMDGPU::OpName::src2_modifiers); 1051 1052 MI.RemoveOperand(Src2ModIdx); 1053 MI.RemoveOperand(Src1ModIdx); 1054 MI.RemoveOperand(Src0ModIdx); 1055 } 1056 1057 bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, 1058 unsigned Reg, MachineRegisterInfo *MRI) const { 1059 if (!MRI->hasOneNonDBGUse(Reg)) 1060 return false; 1061 1062 unsigned Opc = UseMI->getOpcode(); 1063 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { 1064 // Don't fold if we are using source modifiers. The new VOP2 instructions 1065 // don't have them. 1066 if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || 1067 hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || 1068 hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { 1069 return false; 1070 } 1071 1072 MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); 1073 MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); 1074 MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); 1075 1076 // Multiplied part is the constant: Use v_madmk_f32 1077 // We should only expect these to be on src0 due to canonicalizations. 1078 if (Src0->isReg() && Src0->getReg() == Reg) { 1079 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1080 return false; 1081 1082 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 1083 return false; 1084 1085 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 1086 1087 const int64_t Imm = DefMI->getOperand(1).getImm(); 1088 1089 // FIXME: This would be a lot easier if we could return a new instruction 1090 // instead of having to modify in place. 1091 1092 // Remove these first since they are at the end. 
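      // clamp and omod are the trailing operands of the MAD/MAC form, so
      // removing them first keeps the earlier operand indices used below
      // stable.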
1093 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1094 AMDGPU::OpName::omod)); 1095 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1096 AMDGPU::OpName::clamp)); 1097 1098 unsigned Src1Reg = Src1->getReg(); 1099 unsigned Src1SubReg = Src1->getSubReg(); 1100 Src0->setReg(Src1Reg); 1101 Src0->setSubReg(Src1SubReg); 1102 Src0->setIsKill(Src1->isKill()); 1103 1104 if (Opc == AMDGPU::V_MAC_F32_e64) { 1105 UseMI->untieRegOperand( 1106 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1107 } 1108 1109 Src1->ChangeToImmediate(Imm); 1110 1111 removeModOperands(*UseMI); 1112 UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); 1113 1114 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1115 if (DeleteDef) 1116 DefMI->eraseFromParent(); 1117 1118 return true; 1119 } 1120 1121 // Added part is the constant: Use v_madak_f32 1122 if (Src2->isReg() && Src2->getReg() == Reg) { 1123 // Not allowed to use constant bus for another operand. 1124 // We can however allow an inline immediate as src0. 1125 if (!Src0->isImm() && 1126 (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 1127 return false; 1128 1129 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 1130 return false; 1131 1132 const int64_t Imm = DefMI->getOperand(1).getImm(); 1133 1134 // FIXME: This would be a lot easier if we could return a new instruction 1135 // instead of having to modify in place. 1136 1137 // Remove these first since they are at the end. 1138 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1139 AMDGPU::OpName::omod)); 1140 UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, 1141 AMDGPU::OpName::clamp)); 1142 1143 if (Opc == AMDGPU::V_MAC_F32_e64) { 1144 UseMI->untieRegOperand( 1145 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 1146 } 1147 1148 // ChangingToImmediate adds Src2 back to the instruction. 1149 Src2->ChangeToImmediate(Imm); 1150 1151 // These come before src2. 1152 removeModOperands(*UseMI); 1153 UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); 1154 1155 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 1156 if (DeleteDef) 1157 DefMI->eraseFromParent(); 1158 1159 return true; 1160 } 1161 } 1162 1163 return false; 1164 } 1165 1166 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 1167 int WidthB, int OffsetB) { 1168 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1169 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1170 int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 1171 return LowOffset + LowWidth <= HighOffset; 1172 } 1173 1174 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, 1175 MachineInstr *MIb) const { 1176 unsigned BaseReg0, BaseReg1; 1177 int64_t Offset0, Offset1; 1178 1179 if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && 1180 getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { 1181 assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && 1182 "read2 / write2 not expected here yet"); 1183 unsigned Width0 = (*MIa->memoperands_begin())->getSize(); 1184 unsigned Width1 = (*MIb->memoperands_begin())->getSize(); 1185 if (BaseReg0 == BaseReg1 && 1186 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 1187 return true; 1188 } 1189 } 1190 1191 return false; 1192 } 1193 1194 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, 1195 MachineInstr *MIb, 1196 AliasAnalysis *AA) const { 1197 assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && 1198 "MIa must load from or modify a memory location"); 1199 assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && 1200 "MIb must load from or modify a memory location"); 1201 1202 if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) 1203 return false; 1204 1205 // XXX - Can we relax this between address spaces? 1206 if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) 1207 return false; 1208 1209 // TODO: Should we check the address space from the MachineMemOperand? That 1210 // would allow us to distinguish objects we know don't alias based on the 1211 // underlying address space, even if it was lowered to a different one, 1212 // e.g. private accesses lowered to use MUBUF instructions on a scratch 1213 // buffer. 1214 if (isDS(*MIa)) { 1215 if (isDS(*MIb)) 1216 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1217 1218 return !isFLAT(*MIb); 1219 } 1220 1221 if (isMUBUF(*MIa) || isMTBUF(*MIa)) { 1222 if (isMUBUF(*MIb) || isMTBUF(*MIb)) 1223 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1224 1225 return !isFLAT(*MIb) && !isSMRD(*MIb); 1226 } 1227 1228 if (isSMRD(*MIa)) { 1229 if (isSMRD(*MIb)) 1230 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1231 1232 return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); 1233 } 1234 1235 if (isFLAT(*MIa)) { 1236 if (isFLAT(*MIb)) 1237 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1238 1239 return false; 1240 } 1241 1242 return false; 1243 } 1244 1245 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 1246 MachineBasicBlock::iterator &MI, 1247 LiveVariables *LV) const { 1248 1249 switch (MI->getOpcode()) { 1250 default: return nullptr; 1251 case AMDGPU::V_MAC_F32_e64: break; 1252 case AMDGPU::V_MAC_F32_e32: { 1253 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1254 if (Src0->isImm() && !isInlineConstant(*Src0, 4)) 1255 return nullptr; 1256 break; 1257 } 1258 } 1259 1260 const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst); 1261 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1262 const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); 1263 const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); 1264 1265 return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) 1266 .addOperand(*Dst) 1267 .addImm(0) // Src0 mods 1268 .addOperand(*Src0) 1269 .addImm(0) // Src1 mods 1270 .addOperand(*Src1) 1271 .addImm(0) // Src mods 1272 .addOperand(*Src2) 1273 .addImm(0) // clamp 1274 .addImm(0); // omod 1275 } 1276 1277 bool 
SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI, 1278 const MachineBasicBlock *MBB, 1279 const MachineFunction &MF) const { 1280 // Target-independent instructions do not have an implicit-use of EXEC, even 1281 // when they operate on VGPRs. Treating EXEC modifications as scheduling 1282 // boundaries prevents incorrect movements of such instructions. 1283 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); 1284 if (MI->modifiesRegister(AMDGPU::EXEC, TRI)) 1285 return true; 1286 1287 return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF); 1288 } 1289 1290 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1291 int64_t SVal = Imm.getSExtValue(); 1292 if (SVal >= -16 && SVal <= 64) 1293 return true; 1294 1295 if (Imm.getBitWidth() == 64) { 1296 uint64_t Val = Imm.getZExtValue(); 1297 return (DoubleToBits(0.0) == Val) || 1298 (DoubleToBits(1.0) == Val) || 1299 (DoubleToBits(-1.0) == Val) || 1300 (DoubleToBits(0.5) == Val) || 1301 (DoubleToBits(-0.5) == Val) || 1302 (DoubleToBits(2.0) == Val) || 1303 (DoubleToBits(-2.0) == Val) || 1304 (DoubleToBits(4.0) == Val) || 1305 (DoubleToBits(-4.0) == Val); 1306 } 1307 1308 // The actual type of the operand does not seem to matter as long 1309 // as the bits match one of the inline immediate values. For example: 1310 // 1311 // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, 1312 // so it is a legal inline immediate. 1313 // 1314 // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in 1315 // floating-point, so it is a legal inline immediate. 1316 uint32_t Val = Imm.getZExtValue(); 1317 1318 return (FloatToBits(0.0f) == Val) || 1319 (FloatToBits(1.0f) == Val) || 1320 (FloatToBits(-1.0f) == Val) || 1321 (FloatToBits(0.5f) == Val) || 1322 (FloatToBits(-0.5f) == Val) || 1323 (FloatToBits(2.0f) == Val) || 1324 (FloatToBits(-2.0f) == Val) || 1325 (FloatToBits(4.0f) == Val) || 1326 (FloatToBits(-4.0f) == Val); 1327 } 1328 1329 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 1330 unsigned OpSize) const { 1331 if (MO.isImm()) { 1332 // MachineOperand provides no way to tell the true operand size, since it 1333 // only records a 64-bit value. We need to know the size to determine if a 1334 // 32-bit floating point immediate bit pattern is legal for an integer 1335 // immediate. It would be for any 32-bit integer operand, but would not be 1336 // for a 64-bit one. 
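    // For example, the bit pattern 0x3f800000 (1.0f) is an inline constant
    // for a 32-bit operand but only a literal constant for a 64-bit one.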
1337 1338 unsigned BitSize = 8 * OpSize; 1339 return isInlineConstant(APInt(BitSize, MO.getImm(), true)); 1340 } 1341 1342 return false; 1343 } 1344 1345 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, 1346 unsigned OpSize) const { 1347 return MO.isImm() && !isInlineConstant(MO, OpSize); 1348 } 1349 1350 static bool compareMachineOp(const MachineOperand &Op0, 1351 const MachineOperand &Op1) { 1352 if (Op0.getType() != Op1.getType()) 1353 return false; 1354 1355 switch (Op0.getType()) { 1356 case MachineOperand::MO_Register: 1357 return Op0.getReg() == Op1.getReg(); 1358 case MachineOperand::MO_Immediate: 1359 return Op0.getImm() == Op1.getImm(); 1360 default: 1361 llvm_unreachable("Didn't expect to be comparing these operand types"); 1362 } 1363 } 1364 1365 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, 1366 const MachineOperand &MO) const { 1367 const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; 1368 1369 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1370 1371 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 1372 return true; 1373 1374 if (OpInfo.RegClass < 0) 1375 return false; 1376 1377 unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); 1378 if (isLiteralConstant(MO, OpSize)) 1379 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 1380 1381 return RI.opCanUseInlineConstant(OpInfo.OperandType); 1382 } 1383 1384 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 1385 int Op32 = AMDGPU::getVOPe32(Opcode); 1386 if (Op32 == -1) 1387 return false; 1388 1389 return pseudoToMCOpcode(Op32) != -1; 1390 } 1391 1392 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 1393 // The src0_modifier operand is present on all instructions 1394 // that have modifiers. 1395 1396 return AMDGPU::getNamedOperandIdx(Opcode, 1397 AMDGPU::OpName::src0_modifiers) != -1; 1398 } 1399 1400 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 1401 unsigned OpName) const { 1402 const MachineOperand *Mods = getNamedOperand(MI, OpName); 1403 return Mods && Mods->getImm(); 1404 } 1405 1406 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 1407 const MachineOperand &MO, 1408 unsigned OpSize) const { 1409 // Literal constants use the constant bus. 1410 if (isLiteralConstant(MO, OpSize)) 1411 return true; 1412 1413 if (!MO.isReg() || !MO.isUse()) 1414 return false; 1415 1416 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 1417 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 1418 1419 // FLAT_SCR is just an SGPR pair. 1420 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 1421 return true; 1422 1423 // EXEC register uses the constant bus. 1424 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 1425 return true; 1426 1427 // SGPRs use the constant bus 1428 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 1429 (!MO.isImplicit() && 1430 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 1431 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 1432 } 1433 1434 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 1435 for (const MachineOperand &MO : MI.implicit_operands()) { 1436 // We only care about reads. 
1437 if (MO.isDef()) 1438 continue; 1439 1440 switch (MO.getReg()) { 1441 case AMDGPU::VCC: 1442 case AMDGPU::M0: 1443 case AMDGPU::FLAT_SCR: 1444 return MO.getReg(); 1445 1446 default: 1447 break; 1448 } 1449 } 1450 1451 return AMDGPU::NoRegister; 1452 } 1453 1454 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, 1455 StringRef &ErrInfo) const { 1456 uint16_t Opcode = MI->getOpcode(); 1457 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1458 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 1459 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 1460 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 1461 1462 // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes 1463 // all SCC users are in the same blocks as their defs. 1464 const MachineBasicBlock *MBB = MI->getParent(); 1465 if (MI == &MBB->front()) { 1466 if (MBB->isLiveIn(AMDGPU::SCC)) { 1467 ErrInfo = "scc register cannot be live across blocks."; 1468 return false; 1469 } 1470 } 1471 1472 // Make sure the number of operands is correct. 1473 const MCInstrDesc &Desc = get(Opcode); 1474 if (!Desc.isVariadic() && 1475 Desc.getNumOperands() != MI->getNumExplicitOperands()) { 1476 ErrInfo = "Instruction has wrong number of operands."; 1477 return false; 1478 } 1479 1480 // Make sure the register classes are correct. 1481 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 1482 if (MI->getOperand(i).isFPImm()) { 1483 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 1484 "all fp values to integers."; 1485 return false; 1486 } 1487 1488 int RegClass = Desc.OpInfo[i].RegClass; 1489 1490 switch (Desc.OpInfo[i].OperandType) { 1491 case MCOI::OPERAND_REGISTER: 1492 if (MI->getOperand(i).isImm()) { 1493 ErrInfo = "Illegal immediate value for operand."; 1494 return false; 1495 } 1496 break; 1497 case AMDGPU::OPERAND_REG_IMM32: 1498 break; 1499 case AMDGPU::OPERAND_REG_INLINE_C: 1500 if (isLiteralConstant(MI->getOperand(i), 1501 RI.getRegClass(RegClass)->getSize())) { 1502 ErrInfo = "Illegal immediate value for operand."; 1503 return false; 1504 } 1505 break; 1506 case MCOI::OPERAND_IMMEDIATE: 1507 // Check if this operand is an immediate. 1508 // FrameIndex operands will be replaced by immediates, so they are 1509 // allowed. 1510 if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { 1511 ErrInfo = "Expected immediate, but got non-immediate"; 1512 return false; 1513 } 1514 // Fall-through 1515 default: 1516 continue; 1517 } 1518 1519 if (!MI->getOperand(i).isReg()) 1520 continue; 1521 1522 if (RegClass != -1) { 1523 unsigned Reg = MI->getOperand(i).getReg(); 1524 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1525 continue; 1526 1527 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 1528 if (!RC->contains(Reg)) { 1529 ErrInfo = "Operand has incorrect register class."; 1530 return false; 1531 } 1532 } 1533 } 1534 1535 1536 // Verify VOP* 1537 if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { 1538 // Only look at the true operands. Only a real operand can use the constant 1539 // bus, and we don't want to check pseudo-operands like the source modifier 1540 // flags. 
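    // SGPRs, literal constants, and reads of M0, VCC, EXEC or FLAT_SCR all
    // count as constant bus uses; VOP* instructions may use the bus at most
    // once, which is what the count below verifies.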
1541 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 1542 1543 unsigned ConstantBusCount = 0; 1544 unsigned SGPRUsed = findImplicitSGPRRead(*MI); 1545 if (SGPRUsed != AMDGPU::NoRegister) 1546 ++ConstantBusCount; 1547 1548 for (int OpIdx : OpIndices) { 1549 if (OpIdx == -1) 1550 break; 1551 const MachineOperand &MO = MI->getOperand(OpIdx); 1552 if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { 1553 if (MO.isReg()) { 1554 if (MO.getReg() != SGPRUsed) 1555 ++ConstantBusCount; 1556 SGPRUsed = MO.getReg(); 1557 } else { 1558 ++ConstantBusCount; 1559 } 1560 } 1561 } 1562 if (ConstantBusCount > 1) { 1563 ErrInfo = "VOP* instruction uses the constant bus more than once"; 1564 return false; 1565 } 1566 } 1567 1568 // Verify misc. restrictions on specific instructions. 1569 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 1570 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 1571 const MachineOperand &Src0 = MI->getOperand(Src0Idx); 1572 const MachineOperand &Src1 = MI->getOperand(Src1Idx); 1573 const MachineOperand &Src2 = MI->getOperand(Src2Idx); 1574 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 1575 if (!compareMachineOp(Src0, Src1) && 1576 !compareMachineOp(Src0, Src2)) { 1577 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 1578 return false; 1579 } 1580 } 1581 } 1582 1583 // Make sure we aren't losing exec uses in the td files. This mostly requires 1584 // being careful when using let Uses to try to add other use registers. 1585 if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { 1586 const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); 1587 if (!Exec || !Exec->isImplicit()) { 1588 ErrInfo = "VALU instruction does not implicitly read exec mask"; 1589 return false; 1590 } 1591 } 1592 1593 return true; 1594 } 1595 1596 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { 1597 switch (MI.getOpcode()) { 1598 default: return AMDGPU::INSTRUCTION_LIST_END; 1599 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 1600 case AMDGPU::COPY: return AMDGPU::COPY; 1601 case AMDGPU::PHI: return AMDGPU::PHI; 1602 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 1603 case AMDGPU::S_MOV_B32: 1604 return MI.getOperand(1).isReg() ? 
1605 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 1606 case AMDGPU::S_ADD_I32: 1607 case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; 1608 case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; 1609 case AMDGPU::S_SUB_I32: 1610 case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; 1611 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 1612 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 1613 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; 1614 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; 1615 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; 1616 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; 1617 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; 1618 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; 1619 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; 1620 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 1621 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 1622 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 1623 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 1624 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 1625 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 1626 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 1627 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 1628 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 1629 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 1630 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 1631 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 1632 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 1633 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 1634 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 1635 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 1636 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 1637 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 1638 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 1639 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 1640 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 1641 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 1642 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 1643 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 1644 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 1645 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 1646 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 1647 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 1648 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 1649 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 1650 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 1651 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 1652 } 1653 } 1654 1655 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { 1656 return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; 1657 } 1658 1659 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 1660 unsigned OpNo) const { 1661 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 1662 const MCInstrDesc &Desc = get(MI.getOpcode()); 1663 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 1664 Desc.OpInfo[OpNo].RegClass == -1) { 1665 unsigned Reg = MI.getOperand(OpNo).getReg(); 1666 1667 if (TargetRegisterInfo::isVirtualRegister(Reg)) 1668 return MRI.getRegClass(Reg); 1669 return RI.getPhysRegClass(Reg); 1670 } 1671 1672 unsigned RCID = 
Desc.OpInfo[OpNo].RegClass; 1673 return RI.getRegClass(RCID); 1674 } 1675 1676 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { 1677 switch (MI.getOpcode()) { 1678 case AMDGPU::COPY: 1679 case AMDGPU::REG_SEQUENCE: 1680 case AMDGPU::PHI: 1681 case AMDGPU::INSERT_SUBREG: 1682 return RI.hasVGPRs(getOpRegClass(MI, 0)); 1683 default: 1684 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 1685 } 1686 } 1687 1688 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { 1689 MachineBasicBlock::iterator I = MI; 1690 MachineBasicBlock *MBB = MI->getParent(); 1691 MachineOperand &MO = MI->getOperand(OpIdx); 1692 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1693 unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; 1694 const TargetRegisterClass *RC = RI.getRegClass(RCID); 1695 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1696 if (MO.isReg()) 1697 Opcode = AMDGPU::COPY; 1698 else if (RI.isSGPRClass(RC)) 1699 Opcode = AMDGPU::S_MOV_B32; 1700 1701 1702 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 1703 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 1704 VRC = &AMDGPU::VReg_64RegClass; 1705 else 1706 VRC = &AMDGPU::VGPR_32RegClass; 1707 1708 unsigned Reg = MRI.createVirtualRegister(VRC); 1709 DebugLoc DL = MBB->findDebugLoc(I); 1710 BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) 1711 .addOperand(MO); 1712 MO.ChangeToRegister(Reg, false); 1713 } 1714 1715 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 1716 MachineRegisterInfo &MRI, 1717 MachineOperand &SuperReg, 1718 const TargetRegisterClass *SuperRC, 1719 unsigned SubIdx, 1720 const TargetRegisterClass *SubRC) 1721 const { 1722 MachineBasicBlock *MBB = MI->getParent(); 1723 DebugLoc DL = MI->getDebugLoc(); 1724 unsigned SubReg = MRI.createVirtualRegister(SubRC); 1725 1726 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 1727 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1728 .addReg(SuperReg.getReg(), 0, SubIdx); 1729 return SubReg; 1730 } 1731 1732 // Just in case the super register is itself a sub-register, copy it to a new 1733 // value so we don't need to worry about merging its subreg index with the 1734 // SubIdx passed to this function. The register coalescer should be able to 1735 // eliminate this extra copy. 1736 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 1737 1738 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 1739 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 1740 1741 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 1742 .addReg(NewSuperReg, 0, SubIdx); 1743 1744 return SubReg; 1745 } 1746 1747 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 1748 MachineBasicBlock::iterator MII, 1749 MachineRegisterInfo &MRI, 1750 MachineOperand &Op, 1751 const TargetRegisterClass *SuperRC, 1752 unsigned SubIdx, 1753 const TargetRegisterClass *SubRC) const { 1754 if (Op.isImm()) { 1755 // XXX - Is there a better way to do this? 
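// For a 64-bit immediate the two 32-bit halves are produced directly below:
// sub0 gets the low 32 bits (Imm & 0xFFFFFFFF) and sub1 gets the high 32 bits
// (Imm >> 32), so no extract instruction is needed in the immediate case.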
1756 if (SubIdx == AMDGPU::sub0) 1757 return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); 1758 if (SubIdx == AMDGPU::sub1) 1759 return MachineOperand::CreateImm(Op.getImm() >> 32); 1760 1761 llvm_unreachable("Unhandled register index for immediate"); 1762 } 1763 1764 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 1765 SubIdx, SubRC); 1766 return MachineOperand::CreateReg(SubReg, false); 1767 } 1768 1769 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 1770 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { 1771 assert(Inst->getNumExplicitOperands() == 3); 1772 MachineOperand Op1 = Inst->getOperand(1); 1773 Inst->RemoveOperand(1); 1774 Inst->addOperand(Op1); 1775 } 1776 1777 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 1778 const MCOperandInfo &OpInfo, 1779 const MachineOperand &MO) const { 1780 if (!MO.isReg()) 1781 return false; 1782 1783 unsigned Reg = MO.getReg(); 1784 const TargetRegisterClass *RC = 1785 TargetRegisterInfo::isVirtualRegister(Reg) ? 1786 MRI.getRegClass(Reg) : 1787 RI.getPhysRegClass(Reg); 1788 1789 const SIRegisterInfo *TRI = 1790 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1791 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 1792 1793 // In order to be legal, the common sub-class must be equal to the 1794 // class of the current operand. For example: 1795 // 1796 // v_mov_b32 s0 ; Operand defined as vsrc_32 1797 // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL 1798 // 1799 // s_sendmsg 0, s0 ; Operand defined as m0reg 1800 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 1801 1802 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 1803 } 1804 1805 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 1806 const MCOperandInfo &OpInfo, 1807 const MachineOperand &MO) const { 1808 if (MO.isReg()) 1809 return isLegalRegOperand(MRI, OpInfo, MO); 1810 1811 // Handle non-register types that are treated like immediates. 1812 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 1813 return true; 1814 } 1815 1816 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, 1817 const MachineOperand *MO) const { 1818 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1819 const MCInstrDesc &InstDesc = MI->getDesc(); 1820 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 1821 const TargetRegisterClass *DefinedRC = 1822 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 1823 if (!MO) 1824 MO = &MI->getOperand(OpIdx); 1825 1826 if (isVALU(*MI) && 1827 usesConstantBus(MRI, *MO, DefinedRC->getSize())) { 1828 1829 RegSubRegPair SGPRUsed; 1830 if (MO->isReg()) 1831 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 1832 1833 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 1834 if (i == OpIdx) 1835 continue; 1836 const MachineOperand &Op = MI->getOperand(i); 1837 if (Op.isReg() && 1838 (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 1839 usesConstantBus(MRI, Op, getOpSize(*MI, i))) { 1840 return false; 1841 } 1842 } 1843 } 1844 1845 if (MO->isReg()) { 1846 assert(DefinedRC); 1847 return isLegalRegOperand(MRI, OpInfo, *MO); 1848 } 1849 1850 1851 // Handle non-register types that are treated like immediates. 1852 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 1853 1854 if (!DefinedRC) { 1855 // This operand expects an immediate. 
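// (no register class is defined for it), so any immediate, target index or
// frame index is acceptable here.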
1856 return true; 1857 } 1858 1859 return isImmOperandLegal(MI, OpIdx, *MO); 1860 } 1861 1862 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 1863 MachineInstr *MI) const { 1864 unsigned Opc = MI->getOpcode(); 1865 const MCInstrDesc &InstrDesc = get(Opc); 1866 1867 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1868 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1869 1870 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 1871 // we need to only have one constant bus use. 1872 // 1873 // Note we do not need to worry about literal constants here. They are 1874 // disabled for the operand type for instructions because they will always 1875 // violate the one constant bus use rule. 1876 bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; 1877 if (HasImplicitSGPR) { 1878 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1879 MachineOperand &Src0 = MI->getOperand(Src0Idx); 1880 1881 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 1882 legalizeOpWithMove(MI, Src0Idx); 1883 } 1884 1885 // VOP2 src0 instructions support all operand types, so we don't need to check 1886 // their legality. If src1 is already legal, we don't need to do anything. 1887 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 1888 return; 1889 1890 // We do not use commuteInstruction here because it is too aggressive and will 1891 // commute if it is possible. We only want to commute here if it improves 1892 // legality. This can be called a fairly large number of times so don't waste 1893 // compile time pointlessly swapping and checking legality again. 1894 if (HasImplicitSGPR || !MI->isCommutable()) { 1895 legalizeOpWithMove(MI, Src1Idx); 1896 return; 1897 } 1898 1899 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1900 MachineOperand &Src0 = MI->getOperand(Src0Idx); 1901 1902 // If src0 can be used as src1, commuting will make the operands legal. 1903 // Otherwise we have to give up and insert a move. 1904 // 1905 // TODO: Other immediate-like operand kinds could be commuted if there was a 1906 // MachineOperand::ChangeTo* for them. 1907 if ((!Src1.isImm() && !Src1.isReg()) || 1908 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 1909 legalizeOpWithMove(MI, Src1Idx); 1910 return; 1911 } 1912 1913 int CommutedOpc = commuteOpcode(*MI); 1914 if (CommutedOpc == -1) { 1915 legalizeOpWithMove(MI, Src1Idx); 1916 return; 1917 } 1918 1919 MI->setDesc(get(CommutedOpc)); 1920 1921 unsigned Src0Reg = Src0.getReg(); 1922 unsigned Src0SubReg = Src0.getSubReg(); 1923 bool Src0Kill = Src0.isKill(); 1924 1925 if (Src1.isImm()) 1926 Src0.ChangeToImmediate(Src1.getImm()); 1927 else if (Src1.isReg()) { 1928 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 1929 Src0.setSubReg(Src1.getSubReg()); 1930 } else 1931 llvm_unreachable("Should only have register or immediate operands"); 1932 1933 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 1934 Src1.setSubReg(Src0SubReg); 1935 } 1936 1937 // Legalize VOP3 operands. Because all operand types are supported for any 1938 // operand, and since literal constants are not allowed and should never be 1939 // seen, we only need to worry about inserting copies if we use multiple SGPR 1940 // operands. 
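// For example (cf. the comment in findUsedSGPR below):
//   v_fma_f32 v0, s0, s0, s0  -> no moves needed, the single SGPR is reused
//   v_fma_f32 v0, s0, s1, s0  -> s1 must first be copied into a VGPR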
1941 void SIInstrInfo::legalizeOperandsVOP3(
1942 MachineRegisterInfo &MRI,
1943 MachineInstr *MI) const {
1944 unsigned Opc = MI->getOpcode();
1945
1946 int VOP3Idx[3] = {
1947 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
1948 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
1949 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
1950 };
1951
1952 // Find the one SGPR operand we are allowed to use.
1953 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
1954
1955 for (unsigned i = 0; i < 3; ++i) {
1956 int Idx = VOP3Idx[i];
1957 if (Idx == -1)
1958 break;
1959 MachineOperand &MO = MI->getOperand(Idx);
1960
1961 // We should never see a VOP3 instruction with an illegal immediate operand.
1962 if (!MO.isReg())
1963 continue;
1964
1965 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
1966 continue; // VGPRs are legal
1967
1968 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
1969 SGPRReg = MO.getReg();
1970 // We can use one SGPR in each VOP3 instruction.
1971 continue;
1972 }
1973
1974 // If we make it this far, then the operand is not legal and we must
1975 // legalize it.
1976 legalizeOpWithMove(MI, Idx);
1977 }
1978 }
1979
1980 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
1981 MachineRegisterInfo &MRI) const {
1982 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
1983 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
1984 unsigned DstReg = MRI.createVirtualRegister(SRC);
1985 unsigned SubRegs = VRC->getSize() / 4;
1986
1987 SmallVector<unsigned, 8> SRegs;
1988 for (unsigned i = 0; i < SubRegs; ++i) {
1989 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1990 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
1991 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
1992 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
1993 SRegs.push_back(SGPR);
1994 }
1995
1996 MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
1997 UseMI->getDebugLoc(),
1998 get(AMDGPU::REG_SEQUENCE), DstReg);
1999 for (unsigned i = 0; i < SubRegs; ++i) {
2000 MIB.addReg(SRegs[i]);
2001 MIB.addImm(RI.getSubRegFromChannel(i));
2002 }
2003 return DstReg;
2004 }
2005
2006 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2007 MachineInstr *MI) const {
2008
2009 // If the pointer is stored in VGPRs, then we need to move it to
2010 // SGPRs using v_readfirstlane. This is safe because we only select
2011 // loads with uniform pointers to SMRD instructions, so we know the
2012 // pointer value is uniform.
2013 MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
2014 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2015 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2016 SBase->setReg(SGPR);
2017 }
2018 }
2019
2020 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
2021 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2022
2023 // Legalize VOP2
2024 if (isVOP2(*MI) || isVOPC(*MI)) {
2025 legalizeOperandsVOP2(MRI, MI);
2026 return;
2027 }
2028
2029 // Legalize VOP3
2030 if (isVOP3(*MI)) {
2031 legalizeOperandsVOP3(MRI, MI);
2032 return;
2033 }
2034
2035 // Legalize SMRD
2036 if (isSMRD(*MI)) {
2037 legalizeOperandsSMRD(MRI, MI);
2038 return;
2039 }
2040
2041 // Legalize REG_SEQUENCE and PHI
2042 // The register class of the operands must be the same type as the register
2043 // class of the output.
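// For a PHI, the incoming operands are scanned first: if any incoming value
// already lives in a VGPR class (or the result is not an SGPR class), every
// incoming value is copied into the common VGPR class before the terminator
// of its predecessor block, so no VGPR->SGPR copy is ever required.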
2044 if (MI->getOpcode() == AMDGPU::PHI) {
2045 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2046 for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
2047 if (!MI->getOperand(i).isReg() ||
2048 !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
2049 continue;
2050 const TargetRegisterClass *OpRC =
2051 MRI.getRegClass(MI->getOperand(i).getReg());
2052 if (RI.hasVGPRs(OpRC)) {
2053 VRC = OpRC;
2054 } else {
2055 SRC = OpRC;
2056 }
2057 }
2058
2059 // If any of the operands are VGPR registers, then they all must be VGPRs;
2060 // otherwise we will create illegal VGPR->SGPR copies when legalizing
2061 // them.
2062 if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
2063 if (!VRC) {
2064 assert(SRC);
2065 VRC = RI.getEquivalentVGPRClass(SRC);
2066 }
2067 RC = VRC;
2068 } else {
2069 RC = SRC;
2070 }
2071
2072 // Update all the operands so they have the same type.
2073 for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
2074 MachineOperand &Op = MI->getOperand(I);
2075 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2076 continue;
2077 unsigned DstReg = MRI.createVirtualRegister(RC);
2078
2079 // MI is a PHI instruction.
2080 MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
2081 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2082
2083 BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
2084 .addOperand(Op);
2085 Op.setReg(DstReg);
2086 }
2087 }
2088
2089 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2090 // VGPR dest type and SGPR sources, insert copies so all operands are
2091 // VGPRs. This seems to help operand folding / the register coalescer.
2092 if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
2093 MachineBasicBlock *MBB = MI->getParent();
2094 const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
2095 if (RI.hasVGPRs(DstRC)) {
2096 // Update all the operands so they are VGPR register classes. These may
2097 // not be the same register class because REG_SEQUENCE supports mixing
2098 // subregister index types, e.g. sub0_sub1 + sub2 + sub3.
2099 for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
2100 MachineOperand &Op = MI->getOperand(I);
2101 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2102 continue;
2103
2104 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2105 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2106 if (VRC == OpRC)
2107 continue;
2108
2109 unsigned DstReg = MRI.createVirtualRegister(VRC);
2110
2111 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
2112 .addOperand(Op);
2113
2114 Op.setReg(DstReg);
2115 Op.setIsKill();
2116 }
2117 }
2118
2119 return;
2120 }
2121
2122 // Legalize INSERT_SUBREG
2123 // src0 must have the same register class as dst
2124 if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
2125 unsigned Dst = MI->getOperand(0).getReg();
2126 unsigned Src0 = MI->getOperand(1).getReg();
2127 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2128 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2129 if (DstRC != Src0RC) {
2130 MachineBasicBlock &MBB = *MI->getParent();
2131 unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
2132 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
2133 .addReg(Src0);
2134 MI->getOperand(1).setReg(NewSrc0);
2135 }
2136 return;
2137 }
2138
2139 // Legalize MIMG
2140 if (isMIMG(*MI)) {
2141 MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
2142 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2143 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2144 SRsrc->setReg(SGPR);
2145 }
2146
2147 MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
2148 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2149 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2150 SSamp->setReg(SGPR);
2151 }
2152 return;
2153 }
2154
2155 // Legalize MUBUF* instructions
2156 // FIXME: If we start using the non-addr64 instructions for compute, we
2157 // may need to legalize them here.
2158 int SRsrcIdx =
2159 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
2160 if (SRsrcIdx != -1) {
2161 // We have a MUBUF instruction.
2162 MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
2163 unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
2164 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2165 RI.getRegClass(SRsrcRC))) {
2166 // The operands are legal.
2167 // FIXME: We may need to legalize operands besides srsrc.
2168 return;
2169 }
2170
2171 MachineBasicBlock &MBB = *MI->getParent();
2172
2173 // Extract the ptr from the resource descriptor.
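// The 64-bit pointer lives in sub0_sub1 of the 128-bit rsrc; the remaining
// dwords are rebuilt below with the default data format, so the new rsrc has
// a zero base address.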
2174 unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2175 &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2176
2177 // Create an empty resource descriptor
2178 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2179 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2180 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2181 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2182 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2183
2184 // Zero64 = 0
2185 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
2186 Zero64)
2187 .addImm(0);
2188
2189 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2190 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
2191 SRsrcFormatLo)
2192 .addImm(RsrcDataFormat & 0xFFFFFFFF);
2193
2194 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2195 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
2196 SRsrcFormatHi)
2197 .addImm(RsrcDataFormat >> 32);
2198
2199 // NewSRsrc = {Zero64, SRsrcFormat}
2200 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2201 .addReg(Zero64)
2202 .addImm(AMDGPU::sub0_sub1)
2203 .addReg(SRsrcFormatLo)
2204 .addImm(AMDGPU::sub2)
2205 .addReg(SRsrcFormatHi)
2206 .addImm(AMDGPU::sub3);
2207
2208 MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
2209 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2210 if (VAddr) {
2211 // This is already an ADDR64 instruction so we need to add the pointer
2212 // extracted from the resource descriptor to the current value of VAddr.
2213 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2214 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2215
2216 // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2217 DebugLoc DL = MI->getDebugLoc();
2218 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2219 .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2220 .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2221
2222 // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2224 .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2225 .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2226
2227 // NewVaddr = {NewVaddrHi, NewVaddrLo}
2228 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2229 .addReg(NewVAddrLo)
2230 .addImm(AMDGPU::sub0)
2231 .addReg(NewVAddrHi)
2232 .addImm(AMDGPU::sub1);
2233 } else {
2234 // This instruction is the _OFFSET variant, so we need to convert it to
2235 // ADDR64.
2236 assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
2237 < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
2238 "FIXME: Need to emit flat atomics here");
2239
2240 MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
2241 MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
2242 MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
2243 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
2244
2245 // Atomics with return have an additional tied operand and are
2246 // missing some of the special bits.
2247 MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
2248 MachineInstr *Addr64;
2249
2250 if (!VDataIn) {
2251 // Regular buffer load / store.
2252 MachineInstrBuilder MIB
2253 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
2254 .addOperand(*VData)
2255 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2256 // This will be replaced later 2257 // with the new value of vaddr. 2258 .addOperand(*SRsrc) 2259 .addOperand(*SOffset) 2260 .addOperand(*Offset); 2261 2262 // Atomics do not have this operand. 2263 if (const MachineOperand *GLC 2264 = getNamedOperand(*MI, AMDGPU::OpName::glc)) { 2265 MIB.addImm(GLC->getImm()); 2266 } 2267 2268 MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); 2269 2270 if (const MachineOperand *TFE 2271 = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { 2272 MIB.addImm(TFE->getImm()); 2273 } 2274 2275 MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 2276 Addr64 = MIB; 2277 } else { 2278 // Atomics with return. 2279 Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) 2280 .addOperand(*VData) 2281 .addOperand(*VDataIn) 2282 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. 2283 // This will be replaced later 2284 // with the new value of vaddr. 2285 .addOperand(*SRsrc) 2286 .addOperand(*SOffset) 2287 .addOperand(*Offset) 2288 .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) 2289 .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 2290 } 2291 2292 MI->removeFromParent(); 2293 MI = Addr64; 2294 2295 // NewVaddr = {NewVaddrHi, NewVaddrLo} 2296 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 2297 .addReg(SRsrcPtr, 0, AMDGPU::sub0) 2298 .addImm(AMDGPU::sub0) 2299 .addReg(SRsrcPtr, 0, AMDGPU::sub1) 2300 .addImm(AMDGPU::sub1); 2301 2302 VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); 2303 SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); 2304 } 2305 2306 // Update the instruction to use NewVaddr 2307 VAddr->setReg(NewVAddr); 2308 // Update the instruction to use NewSRsrc 2309 SRsrc->setReg(NewSRsrc); 2310 } 2311 } 2312 2313 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2314 SmallVector<MachineInstr *, 128> Worklist; 2315 Worklist.push_back(&TopInst); 2316 2317 while (!Worklist.empty()) { 2318 MachineInstr *Inst = Worklist.pop_back_val(); 2319 MachineBasicBlock *MBB = Inst->getParent(); 2320 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2321 2322 unsigned Opcode = Inst->getOpcode(); 2323 unsigned NewOpcode = getVALUOp(*Inst); 2324 2325 // Handle some special cases 2326 switch (Opcode) { 2327 default: 2328 break; 2329 case AMDGPU::S_AND_B64: 2330 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 2331 Inst->eraseFromParent(); 2332 continue; 2333 2334 case AMDGPU::S_OR_B64: 2335 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 2336 Inst->eraseFromParent(); 2337 continue; 2338 2339 case AMDGPU::S_XOR_B64: 2340 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 2341 Inst->eraseFromParent(); 2342 continue; 2343 2344 case AMDGPU::S_NOT_B64: 2345 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 2346 Inst->eraseFromParent(); 2347 continue; 2348 2349 case AMDGPU::S_BCNT1_I32_B64: 2350 splitScalar64BitBCNT(Worklist, Inst); 2351 Inst->eraseFromParent(); 2352 continue; 2353 2354 case AMDGPU::S_BFE_I64: { 2355 splitScalar64BitBFE(Worklist, Inst); 2356 Inst->eraseFromParent(); 2357 continue; 2358 } 2359 2360 case AMDGPU::S_LSHL_B32: 2361 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2362 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 2363 swapOperands(Inst); 2364 } 2365 break; 2366 case AMDGPU::S_ASHR_I32: 2367 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2368 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 2369 swapOperands(Inst); 2370 } 2371 break; 2372 case AMDGPU::S_LSHR_B32: 2373 if (ST.getGeneration() 
>= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2374 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 2375 swapOperands(Inst); 2376 } 2377 break; 2378 case AMDGPU::S_LSHL_B64: 2379 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2380 NewOpcode = AMDGPU::V_LSHLREV_B64; 2381 swapOperands(Inst); 2382 } 2383 break; 2384 case AMDGPU::S_ASHR_I64: 2385 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2386 NewOpcode = AMDGPU::V_ASHRREV_I64; 2387 swapOperands(Inst); 2388 } 2389 break; 2390 case AMDGPU::S_LSHR_B64: 2391 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2392 NewOpcode = AMDGPU::V_LSHRREV_B64; 2393 swapOperands(Inst); 2394 } 2395 break; 2396 2397 case AMDGPU::S_ABS_I32: 2398 lowerScalarAbs(Worklist, Inst); 2399 Inst->eraseFromParent(); 2400 continue; 2401 2402 case AMDGPU::S_CBRANCH_SCC0: 2403 case AMDGPU::S_CBRANCH_SCC1: 2404 // Clear unused bits of vcc 2405 BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC) 2406 .addReg(AMDGPU::EXEC) 2407 .addReg(AMDGPU::VCC); 2408 break; 2409 2410 case AMDGPU::S_BFE_U64: 2411 case AMDGPU::S_BFM_B64: 2412 llvm_unreachable("Moving this op to VALU not implemented"); 2413 } 2414 2415 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 2416 // We cannot move this instruction to the VALU, so we should try to 2417 // legalize its operands instead. 2418 legalizeOperands(Inst); 2419 continue; 2420 } 2421 2422 // Use the new VALU Opcode. 2423 const MCInstrDesc &NewDesc = get(NewOpcode); 2424 Inst->setDesc(NewDesc); 2425 2426 // Remove any references to SCC. Vector instructions can't read from it, and 2427 // We're just about to add the implicit use / defs of VCC, and we don't want 2428 // both. 2429 for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { 2430 MachineOperand &Op = Inst->getOperand(i); 2431 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 2432 Inst->RemoveOperand(i); 2433 addSCCDefUsersToVALUWorklist(Inst, Worklist); 2434 } 2435 } 2436 2437 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2438 // We are converting these to a BFE, so we need to add the missing 2439 // operands for the size and offset. 2440 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2441 Inst->addOperand(MachineOperand::CreateImm(0)); 2442 Inst->addOperand(MachineOperand::CreateImm(Size)); 2443 2444 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2445 // The VALU version adds the second operand to the result, so insert an 2446 // extra 0 operand. 2447 Inst->addOperand(MachineOperand::CreateImm(0)); 2448 } 2449 2450 Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); 2451 2452 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2453 const MachineOperand &OffsetWidthOp = Inst->getOperand(2); 2454 // If we need to move this to VGPRs, we need to unpack the second operand 2455 // back into the 2 separate ones for bit offset and width. 2456 assert(OffsetWidthOp.isImm() && 2457 "Scalar BFE is only implemented for constant width and offset"); 2458 uint32_t Imm = OffsetWidthOp.getImm(); 2459 2460 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2461 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2462 Inst->RemoveOperand(2); // Remove old immediate. 
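// Re-add the offset and width as two separate immediates, matching the
// (src0, offset, width) operand order used for the V_BFE_* instructions.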
2463 Inst->addOperand(MachineOperand::CreateImm(Offset)); 2464 Inst->addOperand(MachineOperand::CreateImm(BitWidth)); 2465 } 2466 2467 bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef(); 2468 unsigned NewDstReg = AMDGPU::NoRegister; 2469 if (HasDst) { 2470 // Update the destination register class. 2471 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); 2472 if (!NewDstRC) 2473 continue; 2474 2475 unsigned DstReg = Inst->getOperand(0).getReg(); 2476 NewDstReg = MRI.createVirtualRegister(NewDstRC); 2477 MRI.replaceRegWith(DstReg, NewDstReg); 2478 } 2479 2480 // Legalize the operands 2481 legalizeOperands(Inst); 2482 2483 if (HasDst) 2484 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 2485 } 2486 } 2487 2488 //===----------------------------------------------------------------------===// 2489 // Indirect addressing callbacks 2490 //===----------------------------------------------------------------------===// 2491 2492 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { 2493 return &AMDGPU::VGPR_32RegClass; 2494 } 2495 2496 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, 2497 MachineInstr *Inst) const { 2498 MachineBasicBlock &MBB = *Inst->getParent(); 2499 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2500 MachineBasicBlock::iterator MII = Inst; 2501 DebugLoc DL = Inst->getDebugLoc(); 2502 2503 MachineOperand &Dest = Inst->getOperand(0); 2504 MachineOperand &Src = Inst->getOperand(1); 2505 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2506 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2507 2508 BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) 2509 .addImm(0) 2510 .addReg(Src.getReg()); 2511 2512 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 2513 .addReg(Src.getReg()) 2514 .addReg(TmpReg); 2515 2516 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2517 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2518 } 2519 2520 void SIInstrInfo::splitScalar64BitUnaryOp( 2521 SmallVectorImpl<MachineInstr *> &Worklist, 2522 MachineInstr *Inst, 2523 unsigned Opcode) const { 2524 MachineBasicBlock &MBB = *Inst->getParent(); 2525 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2526 2527 MachineOperand &Dest = Inst->getOperand(0); 2528 MachineOperand &Src0 = Inst->getOperand(1); 2529 DebugLoc DL = Inst->getDebugLoc(); 2530 2531 MachineBasicBlock::iterator MII = Inst; 2532 2533 const MCInstrDesc &InstDesc = get(Opcode); 2534 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
2535 MRI.getRegClass(Src0.getReg()) : 2536 &AMDGPU::SGPR_32RegClass; 2537 2538 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2539 2540 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2541 AMDGPU::sub0, Src0SubRC); 2542 2543 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2544 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 2545 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 2546 2547 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 2548 BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2549 .addOperand(SrcReg0Sub0); 2550 2551 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2552 AMDGPU::sub1, Src0SubRC); 2553 2554 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 2555 BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2556 .addOperand(SrcReg0Sub1); 2557 2558 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 2559 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2560 .addReg(DestSub0) 2561 .addImm(AMDGPU::sub0) 2562 .addReg(DestSub1) 2563 .addImm(AMDGPU::sub1); 2564 2565 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2566 2567 // We don't need to legalizeOperands here because for a single operand, src0 2568 // will support any kind of input. 2569 2570 // Move all users of this moved value. 2571 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 2572 } 2573 2574 void SIInstrInfo::splitScalar64BitBinaryOp( 2575 SmallVectorImpl<MachineInstr *> &Worklist, 2576 MachineInstr *Inst, 2577 unsigned Opcode) const { 2578 MachineBasicBlock &MBB = *Inst->getParent(); 2579 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2580 2581 MachineOperand &Dest = Inst->getOperand(0); 2582 MachineOperand &Src0 = Inst->getOperand(1); 2583 MachineOperand &Src1 = Inst->getOperand(2); 2584 DebugLoc DL = Inst->getDebugLoc(); 2585 2586 MachineBasicBlock::iterator MII = Inst; 2587 2588 const MCInstrDesc &InstDesc = get(Opcode); 2589 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2590 MRI.getRegClass(Src0.getReg()) : 2591 &AMDGPU::SGPR_32RegClass; 2592 2593 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2594 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
2595 MRI.getRegClass(Src1.getReg()) :
2596 &AMDGPU::SGPR_32RegClass;
2597
2598 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
2599
2600 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2601 AMDGPU::sub0, Src0SubRC);
2602 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2603 AMDGPU::sub0, Src1SubRC);
2604
2605 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2606 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2607 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2608
2609 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2610 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2611 .addOperand(SrcReg0Sub0)
2612 .addOperand(SrcReg1Sub0);
2613
2614 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2615 AMDGPU::sub1, Src0SubRC);
2616 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2617 AMDGPU::sub1, Src1SubRC);
2618
2619 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2620 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2621 .addOperand(SrcReg0Sub1)
2622 .addOperand(SrcReg1Sub1);
2623
2624 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2625 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2626 .addReg(DestSub0)
2627 .addImm(AMDGPU::sub0)
2628 .addReg(DestSub1)
2629 .addImm(AMDGPU::sub1);
2630
2631 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2632
2633 // Try to legalize the operands in case we need to swap the order to keep it
2634 // valid.
2635 legalizeOperands(LoHalf);
2636 legalizeOperands(HiHalf);
2637
2638 // Move all users of this moved value.
2639 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2640 }
2641
2642 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
2643 MachineInstr *Inst) const {
2644 MachineBasicBlock &MBB = *Inst->getParent();
2645 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2646
2647 MachineBasicBlock::iterator MII = Inst;
2648 DebugLoc DL = Inst->getDebugLoc();
2649
2650 MachineOperand &Dest = Inst->getOperand(0);
2651 MachineOperand &Src = Inst->getOperand(1);
2652
2653 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
2654 const TargetRegisterClass *SrcRC = Src.isReg() ?
2655 MRI.getRegClass(Src.getReg()) :
2656 &AMDGPU::SGPR_32RegClass;
2657
2658 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2659 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2660
2661 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2662
2663 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2664 AMDGPU::sub0, SrcSubRC);
2665 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2666 AMDGPU::sub1, SrcSubRC);
2667
2668 BuildMI(MBB, MII, DL, InstDesc, MidReg)
2669 .addOperand(SrcRegSub0)
2670 .addImm(0);
2671
2672 BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2673 .addOperand(SrcRegSub1)
2674 .addReg(MidReg);
2675
2676 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2677
2678 // We don't need to legalize operands here. src0 for either instruction can be
2679 // an SGPR, and the second input is unused or determined here.
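// (The first v_bcnt adds to a literal 0 and the second adds to MidReg, which
// is already a VGPR.)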
2680 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2681 } 2682 2683 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, 2684 MachineInstr *Inst) const { 2685 MachineBasicBlock &MBB = *Inst->getParent(); 2686 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2687 MachineBasicBlock::iterator MII = Inst; 2688 DebugLoc DL = Inst->getDebugLoc(); 2689 2690 MachineOperand &Dest = Inst->getOperand(0); 2691 uint32_t Imm = Inst->getOperand(2).getImm(); 2692 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2693 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2694 2695 (void) Offset; 2696 2697 // Only sext_inreg cases handled. 2698 assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && 2699 BitWidth <= 32 && 2700 Offset == 0 && 2701 "Not implemented"); 2702 2703 if (BitWidth < 32) { 2704 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2705 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2706 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2707 2708 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 2709 .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) 2710 .addImm(0) 2711 .addImm(BitWidth); 2712 2713 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 2714 .addImm(31) 2715 .addReg(MidRegLo); 2716 2717 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 2718 .addReg(MidRegLo) 2719 .addImm(AMDGPU::sub0) 2720 .addReg(MidRegHi) 2721 .addImm(AMDGPU::sub1); 2722 2723 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2724 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2725 return; 2726 } 2727 2728 MachineOperand &Src = Inst->getOperand(1); 2729 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2730 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 2731 2732 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 2733 .addImm(31) 2734 .addReg(Src.getReg(), 0, AMDGPU::sub0); 2735 2736 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 2737 .addReg(Src.getReg(), 0, AMDGPU::sub0) 2738 .addImm(AMDGPU::sub0) 2739 .addReg(TmpReg) 2740 .addImm(AMDGPU::sub1); 2741 2742 MRI.replaceRegWith(Dest.getReg(), ResultReg); 2743 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 2744 } 2745 2746 void SIInstrInfo::addUsersToMoveToVALUWorklist( 2747 unsigned DstReg, 2748 MachineRegisterInfo &MRI, 2749 SmallVectorImpl<MachineInstr *> &Worklist) const { 2750 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 2751 E = MRI.use_end(); I != E; ++I) { 2752 MachineInstr &UseMI = *I->getParent(); 2753 if (!canReadVGPR(UseMI, I.getOperandNo())) { 2754 Worklist.push_back(&UseMI); 2755 } 2756 } 2757 } 2758 2759 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst, 2760 SmallVectorImpl<MachineInstr *> &Worklist) const { 2761 // This assumes that all the users of SCC are in the same block 2762 // as the SCC def. 2763 for (MachineBasicBlock::iterator I = SCCDefInst, 2764 E = SCCDefInst->getParent()->end(); I != E; ++I) { 2765 2766 // Exit if we find another SCC def. 
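// A new def starts a fresh live range of SCC, so any later readers depend on
// that def rather than on SCCDefInst.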
2767 if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1) 2768 return; 2769 2770 if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1) 2771 Worklist.push_back(I); 2772 } 2773 } 2774 2775 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 2776 const MachineInstr &Inst) const { 2777 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 2778 2779 switch (Inst.getOpcode()) { 2780 // For target instructions, getOpRegClass just returns the virtual register 2781 // class associated with the operand, so we need to find an equivalent VGPR 2782 // register class in order to move the instruction to the VALU. 2783 case AMDGPU::COPY: 2784 case AMDGPU::PHI: 2785 case AMDGPU::REG_SEQUENCE: 2786 case AMDGPU::INSERT_SUBREG: 2787 if (RI.hasVGPRs(NewDstRC)) 2788 return nullptr; 2789 2790 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 2791 if (!NewDstRC) 2792 return nullptr; 2793 return NewDstRC; 2794 default: 2795 return NewDstRC; 2796 } 2797 } 2798 2799 // Find the one SGPR operand we are allowed to use. 2800 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, 2801 int OpIndices[3]) const { 2802 const MCInstrDesc &Desc = MI->getDesc(); 2803 2804 // Find the one SGPR operand we are allowed to use. 2805 // 2806 // First we need to consider the instruction's operand requirements before 2807 // legalizing. Some operands are required to be SGPRs, such as implicit uses 2808 // of VCC, but we are still bound by the constant bus requirement to only use 2809 // one. 2810 // 2811 // If the operand's class is an SGPR, we can never move it. 2812 2813 unsigned SGPRReg = findImplicitSGPRRead(*MI); 2814 if (SGPRReg != AMDGPU::NoRegister) 2815 return SGPRReg; 2816 2817 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2818 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2819 2820 for (unsigned i = 0; i < 3; ++i) { 2821 int Idx = OpIndices[i]; 2822 if (Idx == -1) 2823 break; 2824 2825 const MachineOperand &MO = MI->getOperand(Idx); 2826 if (!MO.isReg()) 2827 continue; 2828 2829 // Is this operand statically required to be an SGPR based on the operand 2830 // constraints? 2831 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 2832 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 2833 if (IsRequiredSGPR) 2834 return MO.getReg(); 2835 2836 // If this could be a VGPR or an SGPR, Check the dynamic register class. 2837 unsigned Reg = MO.getReg(); 2838 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 2839 if (RI.isSGPRClass(RegRC)) 2840 UsedSGPRs[i] = Reg; 2841 } 2842 2843 // We don't have a required SGPR operand, so we have a bit more freedom in 2844 // selecting operands to move. 2845 2846 // Try to select the most used SGPR. If an SGPR is equal to one of the 2847 // others, we choose that. 2848 // 2849 // e.g. 2850 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2851 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2852 2853 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 2854 // prefer those. 
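// Below, an SGPR that appears in more than one source operand is preferred,
// since reusing the same SGPR only counts once against the constant bus limit
// (see the constant bus count check earlier in this file).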
2855 2856 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2857 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2858 SGPRReg = UsedSGPRs[0]; 2859 } 2860 2861 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2862 if (UsedSGPRs[1] == UsedSGPRs[2]) 2863 SGPRReg = UsedSGPRs[1]; 2864 } 2865 2866 return SGPRReg; 2867 } 2868 2869 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2870 const MachineFunction &MF) const { 2871 int End = getIndirectIndexEnd(MF); 2872 int Begin = getIndirectIndexBegin(MF); 2873 2874 if (End == -1) 2875 return; 2876 2877 2878 for (int Index = Begin; Index <= End; ++Index) 2879 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2880 2881 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2882 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2883 2884 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2885 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2886 2887 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2888 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2889 2890 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2891 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2892 2893 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2894 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2895 } 2896 2897 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2898 unsigned OperandName) const { 2899 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2900 if (Idx == -1) 2901 return nullptr; 2902 2903 return &MI.getOperand(Idx); 2904 } 2905 2906 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2907 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2908 if (ST.isAmdHsaOS()) { 2909 RsrcDataFormat |= (1ULL << 56); 2910 2911 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2912 // Set MTYPE = 2 2913 RsrcDataFormat |= (2ULL << 59); 2914 } 2915 2916 return RsrcDataFormat; 2917 } 2918 2919 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 2920 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 2921 AMDGPU::RSRC_TID_ENABLE | 2922 0xffffffff; // Size; 2923 2924 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 2925 2926 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); 2927 2928 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 2929 // Clear them unless we want a huge stride. 2930 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2931 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 2932 2933 return Rsrc23; 2934 } 2935 2936 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { 2937 unsigned Opc = MI->getOpcode(); 2938 2939 return isSMRD(Opc); 2940 } 2941 2942 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { 2943 unsigned Opc = MI->getOpcode(); 2944 2945 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 2946 } 2947 2948 ArrayRef<std::pair<int, const char *>> 2949 SIInstrInfo::getSerializableTargetIndices() const { 2950 static const std::pair<int, const char *> TargetIndices[] = { 2951 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 2952 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 2953 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 2954 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 2955 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 2956 return makeArrayRef(TargetIndices); 2957 } 2958