//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
/// operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic here except for the exec check.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        unsigned &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();

  if (isDS(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  // TODO: This needs finer tuning
  if (NumLoads > 4)
    return false;

  if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
    return true;

  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
    return true;

  if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
      (isMUBUF(*SecondLdSt) ||
       isMTBUF(*SecondLdSt)))
    return true;

  return false;
}

void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1, 0
  };

  unsigned Opcode;
  const int16_t *SubIndices;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
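        // Copy a VGPR-held boolean into VCC: the VOPC compare below writes
        // VCC implicitly with the result of (SrcReg != 0).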
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_15;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  while (unsigned SubIdx = *SubIndices++) {
    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));

    if (*SubIndices)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ?
        AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo->getObjectSize(FrameIndex);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)            // src
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MFI)) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg)            // src
    .addFrameIndex(FrameIndex) // frame_idx
    // Place-holder registers; these will be filled in by
    // SIPrepareScratchRegs.
    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
    .addReg(AMDGPU::SGPR0, RegState::Undef)
    .addMemOperand(MMO);
}

static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MFI)) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex) // frame_idx
    // Place-holder registers; these will be filled in by
    // SIPrepareScratchRegs.
    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
    .addReg(AMDGPU::SGPR0, RegState::Undef)
    .addMemOperand(MMO);
}

/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (MFI->getShaderType() == ShaderType::COMPUTE &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(&Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
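      // For now, scavenge two independent 32-bit SGPRs for the NGROUPS values
      // loaded below.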
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
                             int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SI_CONSTDATA_PTR: {
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);

    // Add 32-bit offset from this instruction to the start of the constant data.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
      .addReg(RegLo)
      .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
      .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
      .addReg(RegHi)
      .addImm(0)
      .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
      .addReg(AMDGPU::SCC, RegState::Implicit);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for a
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
                                                  bool NewMI,
                                                  unsigned OpIdx0,
                                                  unsigned OpIdx1) const {
  int CommutedOpcode = commuteOpcode(*MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI->getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);

  if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
       OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
      (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
       OpIdx1 != static_cast<unsigned>(Src0Idx)))
    return nullptr;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // Make sure it's legal to commute operands for VOP2.
  if (isVOP2(*MI) &&
      (!isOperandLegal(MI, Src0Idx, &Src1) ||
       !isOperandLegal(MI, Src1Idx, &Src0))) {
    return nullptr;
  }

  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() ||
        (!isVOP2(*MI) && !isVOP3(*MI))) {
      return nullptr;
    }

    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods
          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods
        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
  }

  if (MI)
    MI->setDesc(get(CommutedOpcode));

  return MI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  const MCInstrDesc &MCID = MI->getDesc();
  if (!MCID.isCommutable())
    return false;

  unsigned Opc = MI->getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
  // immediate. Also, an immediate src0 operand is not handled in
  // SIInstrInfo::commuteInstruction().
  if (!MI->getOperand(Src0Idx).isReg())
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);
  if (Src1.isImm()) {
    // SIInstrInfo::commuteInstruction() does support commuting the immediate
    // operand src1 in 2 and 3 operand instructions.
    if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
      return false;
  } else if (Src1.isReg()) {
    // If any source modifiers are set, the generic instruction commuting won't
    // understand how to copy the source modifiers.
    if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
      return false;
  } else
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
                                         MachineBasicBlock::iterator I,
                                         unsigned DstReg,
                                         unsigned SrcReg) const {
  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
                 DstReg).addReg(SrcReg);
}

bool SIInstrInfo::isMov(unsigned Opcode) const {
  switch (Opcode) {
  default: return false;
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  }
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      if (!Src2->isReg() ||
          (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
        return false;

      // We need to do some weird looking operand shuffling since the madmk
      // operands are out of the normal expected order with the multiplied
      // constant as the last operand.
      //
      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
      //   src0 -> src2 K
      //   src1 -> src0
      //   src2 -> src1

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      unsigned Src2Reg = Src2->getReg();
      unsigned Src2SubReg = Src2->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      Src1->setReg(Src2Reg);
      Src1->setSubReg(Src2SubReg);
      Src1->setIsKill(Src2->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      Src2->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangeToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ?
                 WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, Offset0;
  unsigned BaseReg1, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(*MIa)) {
    if (isDS(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(*MIb);
  }

  if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
    if (isMUBUF(*MIb) || isMTBUF(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(*MIb) && !isSMRD(*MIb);
  }

  if (isSMRD(*MIa)) {
    if (isSMRD(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
  }

  if (isFLAT(*MIa)) {
    if (isFLAT(*MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineBasicBlock::iterator &MI,
                                                 LiveVariables *LV) const {

  switch (MI->getOpcode()) {
  default: return nullptr;
  case AMDGPU::V_MAC_F32_e64: break;
  case AMDGPU::V_MAC_F32_e32: {
    const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
  const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
  const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
  const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);

  return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
                 .addOperand(*Dst)
                 .addImm(0) // Src0 mods
                 .addOperand(*Src0)
                 .addImm(0) // Src1 mods
                 .addOperand(*Src1)
                 .addImm(0) // Src2 mods
                 .addOperand(*Src2)
                 .addImm(0) // clamp
                 .addImm(0); // omod
}

bool
SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.
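    // For example, the bit pattern 0x3f800000 (1.0f) is an inline constant
    // when checked at 32 bits, but the same value widened to 64 bits matches
    // neither the small-integer range nor any of the double bit patterns.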
    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus.
  if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
      (!MO.isImplicit() &&
       (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
        AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
    return true;
  }

  return false;
}

static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*
  if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = findImplicitSGPRRead(*MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
    if (!Exec || !Exec->isImplicit()) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  return true;
}

unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORD_SGPR:
  case AMDGPU::S_LOAD_DWORD_IMM_ci:
    return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
    return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
    return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return
      RI.getRegClass(RCID);
}

bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &MO = MI->getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
    .addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    // XXX - Is there a better way to do this?
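    // Extract the 32-bit half of the 64-bit immediate selected by SubIdx.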
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(Op.getImm() >> 32);

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
  assert(Inst->getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst->getOperand(1);
  Inst->RemoveOperand(1);
  Inst->addOperand(Op1);
}

bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = get(MI->getOpcode());
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  if (isVALU(*MI) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
    unsigned SGPRUsed =
        MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI->getOperand(i);
      if (Op.isReg() && Op.getReg() != SGPRUsed &&
          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    const TargetRegisterClass *RC =
        TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
            MRI.getRegClass(MO->getReg()) :
            RI.getPhysRegClass(MO->getReg());

    // In order to be legal, the common sub-class must be equal to the
    // class of the current operand. For example:
    //
    // v_mov_b32 s0 ; Operand defined as vsrc_32
    //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
    //
    // s_sendmsg 0, s0 ; Operand defined as m0reg
    //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(
  MachineRegisterInfo &MRI,
  MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
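  // A VOP3 encoding may read at most one SGPR because of the constant bus
  // limit, so any additional SGPR sources found below must be copied to VGPRs.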
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI->getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  unsigned Opc = MI->getOpcode();

  // Legalize VOP2
  if (isVOP2(*MI)) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

    // Legalize src0
    if (!isOperandLegal(MI, Src0Idx))
      legalizeOpWithMove(MI, Src0Idx);

    // Legalize src1
    if (isOperandLegal(MI, Src1Idx))
      return;

    // Usually src0 of VOP2 instructions allows more types of inputs
    // than src1, so try to commute the instruction to decrease our
    // chances of having to insert a MOV instruction to legalize src1.
    if (MI->isCommutable()) {
      if (commuteInstruction(MI))
        // If we are successful in commuting, then we know MI is legal, so
        // we are done.
        return;
    }

    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // Legalize VOP3
  if (isVOP3(*MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be VGPRs;
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI->getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction.
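      // Insert the copy into the predecessor block that supplies this PHI
      // value, just before its terminator, so it executes on the incoming edge.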
      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI->getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI->getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
        .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();

    // Extract the ptr from the resource descriptor.
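    // The 64-bit base pointer occupies the first two dwords (sub0_sub1) of the
    // 128-bit descriptor; pull it out so it can be added to (or become) vaddr.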
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
        &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
        .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
        .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
        .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
        .addReg(Zero64)
        .addImm(AMDGPU::sub0_sub1)
        .addReg(SRsrcFormatLo)
        .addImm(AMDGPU::sub2)
        .addReg(SRsrcFormatHi)
        .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI->getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrLo, NewVaddrHi}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB
            = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
                .addOperand(*VData)
                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                            // This will be replaced later
                                            // with the new value of vaddr.
                .addOperand(*SRsrc)
                .addOperand(*SOffset)
                .addOperand(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC
                = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE
                = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
            .addOperand(*VData)
            .addOperand(*VDataIn)
            .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                        // This will be replaced later
                                        // with the new value of vaddr.
            .addOperand(*SRsrc)
            .addOperand(*SOffset)
            .addOperand(*Offset)
            .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
            .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
      }

      MI->removeFromParent();
      MI = Addr64;

      // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

void SIInstrInfo::splitSMRD(MachineInstr *MI,
                            const TargetRegisterClass *HalfRC,
                            unsigned HalfImmOp, unsigned HalfSGPROp,
                            MachineInstr *&Lo, MachineInstr *&Hi) const {

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RegLo = MRI.createVirtualRegister(HalfRC);
  unsigned RegHi = MRI.createVirtualRegister(HalfRC);
  unsigned HalfSize = HalfRC->getSize();
  const MachineOperand *OffOp =
      getNamedOperand(*MI, AMDGPU::OpName::offset);
  const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);

  // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
  // on VI.

  bool IsKill = SBase->isKill();
  if (OffOp) {
    bool isVI =
        MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
        AMDGPUSubtarget::VOLCANIC_ISLANDS;
    unsigned OffScale = isVI ? 1 : 4;
    // Handle the _IMM variant
    unsigned LoOffset = OffOp->getImm() * OffScale;
    unsigned HiOffset = LoOffset + HalfSize;
    Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
             // Use addReg instead of addOperand
             // to make sure kill flag is cleared.
             .addReg(SBase->getReg(), 0, SBase->getSubReg())
             .addImm(LoOffset / OffScale);

    if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
      unsigned OffsetSGPR =
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
          .addImm(HiOffset); // The offset in register is in bytes.
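      // The high half's offset does not fit in the immediate field, so
      // materialize it in an SGPR and use the SGPR-offset form of the load.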
      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
               .addReg(SBase->getReg(), getKillRegState(IsKill),
                       SBase->getSubReg())
               .addReg(OffsetSGPR);
    } else {
      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
               .addReg(SBase->getReg(), getKillRegState(IsKill),
                       SBase->getSubReg())
               .addImm(HiOffset / OffScale);
    }
  } else {
    // Handle the _SGPR variant
    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
             .addReg(SBase->getReg(), 0, SBase->getSubReg())
             .addOperand(*SOff);
    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
        .addReg(SOff->getReg(), 0, SOff->getSubReg())
        .addImm(HalfSize);
    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
             .addReg(SBase->getReg(), getKillRegState(IsKill),
                     SBase->getSubReg())
             .addReg(OffsetSGPR);
  }

  unsigned SubLo, SubHi;
  const TargetRegisterClass *NewDstRC;
  switch (HalfSize) {
  case 4:
    SubLo = AMDGPU::sub0;
    SubHi = AMDGPU::sub1;
    NewDstRC = &AMDGPU::VReg_64RegClass;
    break;
  case 8:
    SubLo = AMDGPU::sub0_sub1;
    SubHi = AMDGPU::sub2_sub3;
    NewDstRC = &AMDGPU::VReg_128RegClass;
    break;
  case 16:
    SubLo = AMDGPU::sub0_sub1_sub2_sub3;
    SubHi = AMDGPU::sub4_sub5_sub6_sub7;
    NewDstRC = &AMDGPU::VReg_256RegClass;
    break;
  case 32:
    SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
    SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
    NewDstRC = &AMDGPU::VReg_512RegClass;
    break;
  default:
    llvm_unreachable("Unhandled HalfSize");
  }

  unsigned OldDst = MI->getOperand(0).getReg();
  unsigned NewDst = MRI.createVirtualRegister(NewDstRC);

  MRI.replaceRegWith(OldDst, NewDst);

  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
      .addReg(RegLo)
      .addImm(SubLo)
      .addReg(RegHi)
      .addImm(SubHi);
}

void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
                                 MachineRegisterInfo &MRI,
                                 SmallVectorImpl<MachineInstr *> &Worklist) const {
  MachineBasicBlock *MBB = MI->getParent();
  int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  assert(DstIdx != -1);
  unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
  switch (RI.getRegClass(DstRCID)->getSize()) {
  case 4:
  case 8:
  case 16: {
    unsigned NewOpcode = getVALUOp(*MI);
    unsigned RegOffset;
    unsigned ImmOffset;

    if (MI->getOperand(2).isReg()) {
      RegOffset = MI->getOperand(2).getReg();
      ImmOffset = 0;
    } else {
      assert(MI->getOperand(2).isImm());
      // SMRD instructions take a dword offset on SI and a byte offset on VI,
      // and MUBUF instructions always take a byte offset.
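      // Scale the SI/CI dword offset up to bytes before it is moved into the
      // MUBUF form below; VI SMRD offsets are already in bytes.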
      ImmOffset = MI->getOperand(2).getImm();
      if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
          AMDGPUSubtarget::SEA_ISLANDS)
        ImmOffset <<= 2;
      RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

      if (isUInt<12>(ImmOffset)) {
        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), RegOffset)
            .addImm(0);
      } else {
        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), RegOffset)
            .addImm(ImmOffset);
        ImmOffset = 0;
      }
    }

    unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    unsigned DWord0 = RegOffset;
    unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
        .addImm(0);
    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
        .addImm(RsrcDataFormat & 0xFFFFFFFF);
    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
        .addImm(RsrcDataFormat >> 32);
    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
        .addReg(DWord0)
        .addImm(AMDGPU::sub0)
        .addReg(DWord1)
        .addImm(AMDGPU::sub1)
        .addReg(DWord2)
        .addImm(AMDGPU::sub2)
        .addReg(DWord3)
        .addImm(AMDGPU::sub3);

    const MCInstrDesc &NewInstDesc = get(NewOpcode);
    const TargetRegisterClass *NewDstRC
        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
    unsigned DstReg = MI->getOperand(0).getReg();
    MRI.replaceRegWith(DstReg, NewDstReg);

    MachineInstr *NewInst =
        BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
            .addOperand(MI->getOperand(1)) // sbase
            .addReg(SRsrc)
            .addImm(0)
            .addImm(ImmOffset)
            .addImm(0) // glc
            .addImm(0) // slc
            .addImm(0) // tfe
            .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
    MI->eraseFromParent();

    legalizeOperands(NewInst);
    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    break;
  }
  case 32: {
    MachineInstr *Lo, *Hi;
    splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
              AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
    MI->eraseFromParent();
    moveSMRDToVALU(Lo, MRI, Worklist);
    moveSMRDToVALU(Hi, MRI, Worklist);
    break;
  }

  case 64: {
    MachineInstr *Lo, *Hi;
    splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
              AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
    MI->eraseFromParent();
    moveSMRDToVALU(Lo, MRI, Worklist);
    moveSMRDToVALU(Hi, MRI, Worklist);
    break;
  }
  }
}

void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SmallVector<MachineInstr *, 128> Worklist;
  Worklist.push_back(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr *Inst = Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst->getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst->getOpcode();
    unsigned NewOpcode = getVALUOp(*Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      if (isSMRD(*Inst)) {
        moveSMRDToVALU(Inst, MRI, Worklist);
        continue;
      }
      break;
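    // 64-bit scalar bit operations have no single VALU equivalent, so they are
    // split into two 32-bit VALU operations on the low and high halves.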
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64: {
      splitScalar64BitBFE(Worklist, Inst);
      Inst->eraseFromParent();
      continue;
    }

    case AMDGPU::S_LSHL_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst->setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst->getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
        Inst->RemoveOperand(i);
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst->addOperand(MachineOperand::CreateImm(0));
      Inst->addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
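      // (V_BCNT_U32_B32 computes popcount(src0) + src1, so src1 must be zero.)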
      Inst->addOperand(MachineOperand::CreateImm(0));
    }

    Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst->RemoveOperand(2);                     // Remove old immediate.
      Inst->addOperand(MachineOperand::CreateImm(Offset));
      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
    }

    // Update the destination register class.
    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
    if (!NewDstRC)
      continue;

    unsigned DstReg = Inst->getOperand(0).getReg();
    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
    MRI.replaceRegWith(DstReg, NewDstReg);

    // Legalize the operands
    legalizeOperands(Inst);

    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

//===----------------------------------------------------------------------===//
// Indirect addressing callbacks
//===----------------------------------------------------------------------===//

unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
                                               unsigned Channel) const {
  assert(Channel == 0);
  return RegIndex;
}

const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
  return &AMDGPU::VGPR_32RegClass;
}

void SIInstrInfo::splitScalar64BitUnaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBinaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  MachineOperand &Src1 = Inst->getOperand(2);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                             .addOperand(SrcReg0Sub0)
                             .addOperand(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                             .addOperand(SrcReg0Sub1)
                             .addOperand(SrcReg1Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                                       MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  BuildMI(MBB, MII, DL, InstDesc, MidReg)
    .addOperand(SrcRegSub0)
    .addImm(0);

  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
    .addOperand(SrcRegSub1)
    .addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
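  // Users of the original 64-bit destination now read the 32-bit count, so
  // queue any that cannot read a VGPR for conversion as well.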
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  uint32_t Imm = Inst->getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
         BitWidth <= 32 &&
         Offset == 0 &&
         "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E; ++I) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.push_back(&UseMI);
    }
  }
}

const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI->getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(*MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI->getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.
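  // An SGPR that appears in more than one source is preferred, since rereading
  // the same SGPR only counts once against the constant bus.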

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
  MachineBasicBlock *MBB,
  MachineBasicBlock::iterator I,
  unsigned ValueReg,
  unsigned Address, unsigned OffsetReg) const {
  const DebugLoc &DL = MBB->findDebugLoc(I);
  unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
      getIndirectIndexBegin(*MBB->getParent()));

  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
      .addReg(IndirectBaseReg, RegState::Define)
      .addOperand(I->getOperand(0))
      .addReg(IndirectBaseReg)
      .addReg(OffsetReg)
      .addImm(0)
      .addReg(ValueReg);
}

MachineInstrBuilder SIInstrInfo::buildIndirectRead(
  MachineBasicBlock *MBB,
  MachineBasicBlock::iterator I,
  unsigned ValueReg,
  unsigned Address, unsigned OffsetReg) const {
  const DebugLoc &DL = MBB->findDebugLoc(I);
  unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
      getIndirectIndexBegin(*MBB->getParent()));

  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1))
      .addOperand(I->getOperand(0))
      .addOperand(I->getOperand(1))
      .addReg(IndirectBaseReg)
      .addReg(OffsetReg)
      .addImm(0);
}

void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                           const MachineFunction &MF) const {
  int End = getIndirectIndexEnd(MF);
  int Begin = getIndirectIndexBegin(MF);

  if (End == -1)
    return;

  for (int Index = Begin; Index <= End; ++Index)
    Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}

MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    RsrcDataFormat |= (1ULL << 56);

    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      // Set MTYPE = 2
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}