//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//


#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// \brief Returns the number of operands of \p Node, not counting any
/// trailing glue operands (operands whose value type is MVT::Glue).
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// \brief Returns the chain operand of \p Load, i.e. its last non-glue
/// operand. Asserts that the operand really is a chain (MVT::Other).
static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither node has the named operand: trivially "same".
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;


  // Only one of the two nodes has the operand, so the values cannot match.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                   AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

/// \brief Returns true (with the two offsets) if \p Load0 and \p Load1 are
/// machine loads of the same class (DS, SMRD, or MUBUF/MTBUF) reading from
/// the same base pointer / chain, differing only in their immediate offsets.
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    // Non-constant (e.g. register) offsets are not comparable here.
    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

/// \brief Returns true for the DS read2st64/write2st64 opcodes, whose two
/// offsets are scaled by an extra factor of 64 elements.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

/// \brief If \p LdSt addresses memory as a single base register plus a
/// constant byte offset, report them in \p BaseReg / \p Offset and return
/// true. Handles DS (including the consecutive offset0/offset1 read2/write2
/// forms), MUBUF/MTBUF without soffset, and SMRD instructions.
bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        int64_t &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();

  if (isDS(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive. We
    // will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    // DS_PERMUTE does not have Offset0Imm (and Offset1Imm).
    if (!Offset0Imm)
      return false;

    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        // For a read2, the destination register covers both elements, so one
        // element is half the size of the def's register class.
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    // With an soffset operand the address is not just vaddr + offset.
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

/// \brief Scheduler hook: cluster up to four loads of the same class
/// (DS with DS, SMRD with SMRD, or buffer with buffer).
bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  // TODO: This needs finer tuning
  if (NumLoads > 4)
    return false;

  if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
    return true;

  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
    return true;

  return (isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
         (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt));
}

/// \brief Emit a physical register copy from \p SrcReg to \p DestReg.
/// 32/64-bit SGPR and 32-bit VGPR copies are a single mov; wider register
/// classes are split into a sequence of subregister moves, ordered so that
/// overlapping source/dest tuples are not clobbered mid-copy.
void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  // Subregister index tables used to decompose wide copies.
  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
  };

  static const int16_t Sub0_7_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
  };

  static const int16_t Sub0_3_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1,
  };

  unsigned Opcode;
  ArrayRef<int16_t> SubIndices;
  bool Forward;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        // Materialize the i1 value into VCC by comparing against zero;
        // V_CMP writes VCC implicitly.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_3_64;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_7_64;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_15_64;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  // Copy low-to-high when the dest starts below the source, otherwise
  // high-to-low, so an overlapping copy never reads an already-written reg.
  if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
    Forward = true;
  else
    Forward = false;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    // Kill the full source register on the last partial copy.
    if (Idx == SubIndices.size() - 1)
      Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);

    // Define the full dest register on the first partial copy.
    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

/// \brief Returns the opcode to use for the commuted form of \p MI, checking
/// via pseudoToMCOpcode that it exists on this target (-1 if not). Returns
/// the original opcode when there is no REV <-> non-REV mapping at all.
int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

/// \brief Returns the mov opcode matching \p DstRC's size and register bank
/// (SGPR vs VGPR); falls back to the generic COPY for other sizes.
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

/// \brief SGPR spill-save pseudo opcode for a register of \p Size bytes.
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief VGPR spill-save pseudo opcode for a register of \p Size bytes.
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief Spill \p SrcReg to stack slot \p FrameIndex using the SI spill
/// pseudos. Emits an error (and a KILL placeholder) if VGPR spilling is
/// required but not enabled for this function.
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo->getObjectSize(FrameIndex);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling
    // SGPRs.
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)            // src
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MFI)) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    // Keep the verifier happy with a placeholder use of the register.
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
            .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg)                         // src
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

/// \brief SGPR spill-restore pseudo opcode for a register of \p Size bytes.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief VGPR spill-restore pseudo opcode for a register of \p Size bytes.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief Reload \p DestReg from stack slot \p FrameIndex, mirroring
/// storeRegToStackSlot. Emits an error (and an IMPLICIT_DEF placeholder)
/// if VGPR spilling is required but not enabled.
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MFI)) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    // Placeholder def so later uses of DestReg remain well-formed.
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)              // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
/// \brief Computes (into \p TmpReg) the per-lane LDS address used to spill
/// the given frame offset, lazily emitting the thread-ID computation into
/// the entry block the first time it is needed. Returns \p TmpReg, or
/// AMDGPU::NoRegister if no VGPR could be found for the thread ID.
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    // Emit the TID computation once, at the top of the entry block.
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;


    if (MFI->getShaderType() == ShaderType::COMPUTE &&
        WorkGroupSize > WavefrontSize) {
      // Workgroup spans multiple waves: derive a group-unique TID from the
      // workgroup IDs and the NGROUPS kernel inputs.
      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(&Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
              .addReg(STmp1)
              .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
              .addReg(STmp1)
              .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
              .addReg(STmp0)
              .addReg(TIDIGYReg)
              .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
              .addReg(TIDReg)
              .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addReg(TIDReg);
    }

    // Scale TID by 4 (dword stride) via a left shift of 2.
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
            .addImm(2)
            .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
          .addImm(LDSOffset)
          .addReg(TIDReg);

  return TmpReg;
}

/// \brief Insert \p Count wait states before \p MI by emitting S_NOP
/// instructions; each S_NOP's immediate covers Arg + 1 of the requested
/// wait states, so the count is consumed in chunks of up to 8.
void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
                                   int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
            .addImm(Arg);
  }
}

/// \brief Lower SI-specific post-RA pseudo instructions (SGPR_USE,
/// V_MOV_B64_PSEUDO, V_CNDMASK_B64_PSEUDO, SI_CONSTDATA_PTR) into real
/// machine instructions; defers everything else to AMDGPUInstrInfo.
bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Expand to two 32-bit moves of the low and high halves. The implicit
    // Dst operands keep the full 64-bit register live across both halves.
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
              .addImm(Imm.getLoBits(32).getZExtValue())
              .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
              .addImm(Imm.getHiBits(32).getZExtValue())
              .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
              .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
              .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Expand to two 32-bit cndmasks sharing the same condition operand.
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
        .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
        .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
        .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
        .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
        .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
        .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::SI_CONSTDATA_PTR: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                           .addReg(RegLo)
                           .addOperand(MI->getOperand(1)));
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                           .addReg(RegHi)
                           .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
891 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, 892 bool NewMI, 893 unsigned OpIdx0, 894 unsigned OpIdx1) const { 895 int CommutedOpcode = commuteOpcode(*MI); 896 if (CommutedOpcode == -1) 897 return nullptr; 898 899 int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 900 AMDGPU::OpName::src0); 901 MachineOperand &Src0 = MI->getOperand(Src0Idx); 902 if (!Src0.isReg()) 903 return nullptr; 904 905 int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 906 AMDGPU::OpName::src1); 907 908 if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || 909 OpIdx1 != static_cast<unsigned>(Src1Idx)) && 910 (OpIdx0 != static_cast<unsigned>(Src1Idx) || 911 OpIdx1 != static_cast<unsigned>(Src0Idx))) 912 return nullptr; 913 914 MachineOperand &Src1 = MI->getOperand(Src1Idx); 915 916 917 if (isVOP2(*MI)) { 918 const MCInstrDesc &InstrDesc = MI->getDesc(); 919 // For VOP2 instructions, any operand type is valid to use for src0. Make 920 // sure we can use the src1 as src0. 921 // 922 // We could be stricter here and only allow commuting if there is a reason 923 // to do so. i.e. if both operands are VGPRs there is no real benefit, 924 // although MachineCSE attempts to find matches by commuting. 925 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 926 if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) 927 return nullptr; 928 } 929 930 if (!Src1.isReg()) { 931 // Allow commuting instructions with Imm operands. 932 if (NewMI || !Src1.isImm() || 933 (!isVOP2(*MI) && !isVOP3(*MI))) { 934 return nullptr; 935 } 936 // Be sure to copy the source modifiers to the right place. 937 if (MachineOperand *Src0Mods 938 = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 939 MachineOperand *Src1Mods 940 = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); 941 942 int Src0ModsVal = Src0Mods->getImm(); 943 if (!Src1Mods && Src0ModsVal != 0) 944 return nullptr; 945 946 // XXX - This assert might be a lie. 
It might be useful to have a neg 947 // modifier with 0.0. 948 int Src1ModsVal = Src1Mods->getImm(); 949 assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); 950 951 Src1Mods->setImm(Src0ModsVal); 952 Src0Mods->setImm(Src1ModsVal); 953 } 954 955 unsigned Reg = Src0.getReg(); 956 unsigned SubReg = Src0.getSubReg(); 957 if (Src1.isImm()) 958 Src0.ChangeToImmediate(Src1.getImm()); 959 else 960 llvm_unreachable("Should only have immediates"); 961 962 Src1.ChangeToRegister(Reg, false); 963 Src1.setSubReg(SubReg); 964 } else { 965 MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); 966 } 967 968 if (MI) 969 MI->setDesc(get(CommutedOpcode)); 970 971 return MI; 972 } 973 974 // This needs to be implemented because the source modifiers may be inserted 975 // between the true commutable operands, and the base 976 // TargetInstrInfo::commuteInstruction uses it. 977 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, 978 unsigned &SrcOpIdx0, 979 unsigned &SrcOpIdx1) const { 980 const MCInstrDesc &MCID = MI->getDesc(); 981 if (!MCID.isCommutable()) 982 return false; 983 984 unsigned Opc = MI->getOpcode(); 985 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 986 if (Src0Idx == -1) 987 return false; 988 989 // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on 990 // immediate. Also, immediate src0 operand is not handled in 991 // SIInstrInfo::commuteInstruction(); 992 if (!MI->getOperand(Src0Idx).isReg()) 993 return false; 994 995 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 996 if (Src1Idx == -1) 997 return false; 998 999 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1000 if (Src1.isImm()) { 1001 // SIInstrInfo::commuteInstruction() does support commuting the immediate 1002 // operand src1 in 2 and 3 operand instructions. 
    if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
      return false;
  } else if (Src1.isReg()) {
    // If any source modifiers are set, the generic instruction commuting won't
    // understand how to copy the source modifiers.
    if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
      return false;
  } else
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

// Strip the src0/src1/src2 modifier operands from \p MI. Assumes all three
// modifier operands are present (true for V_MAD_F32 / V_MAC_F32_e64, the only
// opcodes this is used on below).
static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  // Remove in decreasing index order so the earlier indices stay valid.
  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  // Only fold when this is the sole (non-debug) use of the defined register.
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      // The remaining two sources must both be VGPRs: v_madmk can only take
      // one constant, and an SGPR would add a second constant bus use.
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      // Move the register from src1 into src0.
      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      // V_MAC ties src2 to the dst; the tie must be dropped before operands
      // are removed/rewritten.
      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      Src1->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.
      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      // V_MAC ties src2 to the dst; the tie must be dropped before rewriting.
      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

// Return true if the byte ranges [OffsetA, OffsetA + WidthA) and
// [OffsetB, OffsetB + WidthB) do not intersect.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  // Disjoint iff the lower access ends at or before the higher one starts.
  return LowOffset + LowWidth <= HighOffset;
}

// Return true if \p MIa and \p MIb are base+immediate accesses off the same
// base register whose accessed byte ranges provably do not overlap.
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
1188 if (isDS(*MIa)) { 1189 if (isDS(*MIb)) 1190 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1191 1192 return !isFLAT(*MIb); 1193 } 1194 1195 if (isMUBUF(*MIa) || isMTBUF(*MIa)) { 1196 if (isMUBUF(*MIb) || isMTBUF(*MIb)) 1197 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1198 1199 return !isFLAT(*MIb) && !isSMRD(*MIb); 1200 } 1201 1202 if (isSMRD(*MIa)) { 1203 if (isSMRD(*MIb)) 1204 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1205 1206 return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); 1207 } 1208 1209 if (isFLAT(*MIa)) { 1210 if (isFLAT(*MIb)) 1211 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1212 1213 return false; 1214 } 1215 1216 return false; 1217 } 1218 1219 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 1220 MachineBasicBlock::iterator &MI, 1221 LiveVariables *LV) const { 1222 1223 switch (MI->getOpcode()) { 1224 default: return nullptr; 1225 case AMDGPU::V_MAC_F32_e64: break; 1226 case AMDGPU::V_MAC_F32_e32: { 1227 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1228 if (Src0->isImm() && !isInlineConstant(*Src0, 4)) 1229 return nullptr; 1230 break; 1231 } 1232 } 1233 1234 const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst); 1235 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1236 const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); 1237 const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); 1238 1239 return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) 1240 .addOperand(*Dst) 1241 .addImm(0) // Src0 mods 1242 .addOperand(*Src0) 1243 .addImm(0) // Src1 mods 1244 .addOperand(*Src1) 1245 .addImm(0) // Src mods 1246 .addOperand(*Src2) 1247 .addImm(0) // clamp 1248 .addImm(0); // omod 1249 } 1250 1251 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1252 int64_t SVal = Imm.getSExtValue(); 1253 if (SVal >= -16 && SVal <= 64) 1254 return true; 1255 1256 if (Imm.getBitWidth() == 64) { 
    // 64-bit operands: inline iff the bits match one of the inlinable
    // double-precision constants.
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.
    // Truncate the stored 64-bit immediate to the operand's real width before
    // classifying it.
    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

// A "literal" constant is an immediate that is not encodable inline.
bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

// Compare two operands of the same kind for equality (registers by register,
// immediates by value). Other operand kinds are not expected here.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  // A plain immediate operand accepts anything.
  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  // Otherwise legality depends on whether the value must be encoded as a
  // literal or can be an inline constant for this operand type.
  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

// Return true if \p Opcode has a VOP2-style 32-bit encoding that is actually
// available on the subtarget.
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.
  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

// Return true if the named modifier operand exists and has a non-zero value.
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

// Return true if operand \p MO consumes the single scalar constant bus slot
// of a VALU instruction (literals and SGPR-class reads do; VGPRs do not).
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

// Return the first implicit read of VCC, M0 or FLAT_SCR in \p MI, or
// AMDGPU::NoRegister when there is none.
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

// Machine verifier hook: check SI-specific structural invariants of \p MI.
// On failure, sets \p ErrInfo to a diagnostic and returns false.
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes
  // all SCC users are in the same blocks as their defs.
  const MachineBasicBlock *MBB = MI->getParent();
  if (MI == &MBB->front()) {
    if (MBB->isLiveIn(AMDGPU::SCC)) {
      ErrInfo = "scc register cannot be live across blocks.";
      return false;
    }
  }

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      // Accepts registers and any immediate (literal or inline).
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      // Accepts registers and inline constants only; literals are illegal.
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      // Only physical registers can be checked against the descriptor's
      // register class here; virtual registers are skipped.
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*
  if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = findImplicitSGPRRead(*MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          // Re-reading the same SGPR does not add a second bus use.
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
    if (!Exec || !Exec->isImplicit()) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  return true;
}

// Map a scalar (SALU) opcode to the vector (VALU) opcode used when moving the
// instruction to the VALU, or INSTRUCTION_LIST_END if there is no mapping.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case
  AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): S_NOT_B64 maps to the 32-bit V_NOT_B32_e32; presumably the
  // 64-bit op is split into two 32-bit halves by the moveToVALU path —
  // confirm against the caller.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

// Return the register class of operand \p OpNo, preferring the class from the
// instruction descriptor and falling back to the class of the register itself.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if
      (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
       Desc.OpInfo[OpNo].RegClass == -1) {
    // No class in the descriptor: derive it from the register operand itself.
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

// Return true if operand \p OpNo of \p MI can be a VGPR. For the generic
// copy-like opcodes the answer is determined by the destination class.
bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

// Legalize operand \p OpIdx by copying/moving its value into a freshly
// created virtual register and rewriting the operand to use it.
void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &MO = MI->getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  // Pick the move opcode: COPY for registers, otherwise a materializing move.
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  // NOTE(review): the destination class is always forced to a VGPR class
  // below, even when Opcode is S_MOV_B32 — looks suspicious; confirm the
  // SGPR case is actually reachable here.
  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
      .addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  // Copy sub-register SubIdx of SuperReg into a new virtual register of
  // class SubRC and return it.
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
        .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
      .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

// Like buildExtractSubReg, but \p Op may also be a 64-bit immediate, in which
// case the requested 32-bit half is returned as an immediate operand.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    // XXX - Is there a better way to do this?
    // Select the low or high 32 bits of the 64-bit immediate.
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(Op.getImm() >> 32);

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
  assert(Inst->getNumExplicitOperands() == 3);
  // Removing operand 1 and re-appending it moves it behind operand 2.
  MachineOperand Op1 = Inst->getOperand(1);
  Inst->RemoveOperand(1);
  Inst->addOperand(Op1);
}

// Return true if register operand \p MO satisfies the register class
// constraint \p OpInfo imposes on it.
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  unsigned Reg = MO.getReg();
  const TargetRegisterClass *RC =
    TargetRegisterInfo::isVirtualRegister(Reg) ?
    MRI.getRegClass(Reg) :
    RI.getPhysRegClass(Reg);

  const SIRegisterInfo *TRI =
    static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  // Account for a sub-register access: legality is judged on the class of the
  // accessed sub-register, not the full register.
  RC = TRI->getSubRegClass(RC, MO.getSubReg());

  // In order to be legal, the common sub-class must be equal to the
  // class of the current operand.
  // For example:
  //
  // v_mov_b32 s0 ; Operand defined as vsrc_32
  //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

// Like isLegalRegOperand, but also accepts immediate-like operands, which are
// always legal for a VSrc-type operand.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

// Return true if \p MO (or the existing operand \p OpIdx when MO is null)
// would be legal in slot \p OpIdx of \p MI, including the one-constant-bus
// restriction for VALU instructions.
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI->getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
    OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  // NOTE(review): DefinedRC is dereferenced here before the null check
  // further down; presumably VALU source operands always carry a register
  // class — confirm.
  if (isVALU(*MI) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {

    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    // Reject if any other operand already occupies the constant bus with a
    // different value.
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI->getOperand(i);
      if (Op.isReg() &&
          (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Rewrite the operands of a VOP2 instruction so they satisfy the encoding's
// constraints (src1 must be a VGPR; at most one constant bus use), inserting
// moves or commuting the operands as needed.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI->getOperand(Src0Idx);

    // The implicit read already uses the bus, so an SGPR src0 must be moved.
    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI->isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI->getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(*MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI->setDesc(get(CommutedOpc));

  // Manually swap the two source operands (register or immediate in src0,
  // register moved into src1), preserving subregister and kill state.
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(
  MachineRegisterInfo &MRI,
  MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI->getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

/// Copy a (possibly wide) VGPR \p SrcReg into a fresh virtual SGPR of the
/// equivalent class by emitting one V_READFIRSTLANE_B32 per 32-bit lane,
/// then assembling the pieces with a REG_SEQUENCE before \p UseMI.
/// Returns the new SGPR virtual register.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  // One readfirstlane per 32-bit subregister.
  unsigned SubRegs = VRC->getSize() / 4;

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
      .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
                                    UseMI->getDebugLoc(),
                                    get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr *MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

/// Legalize the operands of \p MI so that all register-class and
/// constant-bus constraints of its opcode are satisfied, dispatching to the
/// per-format helpers (VOP2/VOPC, VOP3, SMRD) and handling PHI,
/// REG_SEQUENCE, INSERT_SUBREG, MIMG and MUBUF instructions inline.
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  // Legalize VOP2
  if (isVOP2(*MI) || isVOPC(*MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(*MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(*MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // PHI operands come in (value, predecessor-MBB) pairs starting at index 1.
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI->getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction, so the copy must be inserted in the
      // corresponding predecessor block, before its terminator.
      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI->getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI->getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
        .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MIMG: the resource and sampler operands must be SGPRs.
  if (isMIMG(*MI)) {
    MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
            Zero64)
      .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI->getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB
          = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC
            = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE
            = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addOperand(*VDataIn)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset)
          .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
          .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
      }

      MI->removeFromParent();
      MI = Addr64;

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      // Re-fetch the operands from the newly created ADDR64 instruction.
      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

/// Replace the SALU instruction \p TopInst with an equivalent VALU
/// sequence, then iteratively move every instruction that becomes illegal
/// (e.g. users of the new VGPR results) to the VALU as well, using a
/// worklist.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SmallVector<MachineInstr *, 128> Worklist;
  Worklist.push_back(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr *Inst = Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst->getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst->getOpcode();
    unsigned NewOpcode = getVALUOp(*Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64: {
      splitScalar64BitBFE(Worklist, Inst);
      Inst->eraseFromParent();
      continue;
    }

    // On VI the VALU shift instructions are *REV: the shift amount comes
    // first, so the operands must be swapped.
    case AMDGPU::S_LSHL_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
        .addReg(AMDGPU::EXEC)
        .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst->setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst->getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        Inst->RemoveOperand(i);
        addSCCDefUsersToVALUWorklist(Inst, Worklist);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst->addOperand(MachineOperand::CreateImm(0));
      Inst->addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst->addOperand(MachineOperand::CreateImm(0));
    }

    Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst->RemoveOperand(2); // Remove old immediate.
      Inst->addOperand(MachineOperand::CreateImm(Offset));
      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
      if (!NewDstRC)
        continue;

      unsigned DstReg = Inst->getOperand(0).getReg();
      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    legalizeOperands(Inst);

    if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

//===----------------------------------------------------------------------===//
// Indirect addressing callbacks
//===----------------------------------------------------------------------===//

const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
  return &AMDGPU::VGPR_32RegClass;
}

/// Lower S_ABS_I32 to the VALU sequence max(x, 0 - x), queuing users of the
/// result for the moveToVALU worklist.
void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
                                 MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  // ResultReg = max(Src, TmpReg) == |Src|
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Split a 64-bit scalar unary operation into two 32-bit VALU instructions
/// of \p Opcode, one per 32-bit half, recombining the results with a
/// REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  // High half.
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Split a 64-bit scalar binary operation into two 32-bit VALU instructions
/// of \p Opcode operating on the sub0 and sub1 halves of both sources,
/// recombining the results with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  MachineOperand &Src1 = Inst->getOperand(2);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0)
    .addOperand(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1)
    .addOperand(SrcReg1Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Split S_BCNT1_I32_B64 into two chained V_BCNT_U32_B32 instructions:
/// count the low half, then count the high half while accumulating the
/// low-half result through the second operand.
void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                                       MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = popcount(Src.sub0) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg)
    .addOperand(SrcRegSub0)
    .addImm(0);

  // ResultReg = popcount(Src.sub1) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
    .addOperand(SrcRegSub1)
    .addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Lower S_BFE_I64 (64-bit sign-extend-in-reg with constant offset 0 and
/// width <= 32) to VALU: V_BFE_I32 on the low half plus an arithmetic shift
/// to produce the sign-extended high half.
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  uint32_t Imm = Inst->getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
         BitWidth <= 32 &&
         Offset == 0 &&
         "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // Low half: sign-extend the BitWidth-wide field from bit 0.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    // High half: replicate the sign bit of the low half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; only compute the sign-extended
  // high half.
  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Add every user of \p DstReg that cannot read a VGPR in the using operand
/// to the moveToVALU worklist.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E; ++I) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI,
                     I.getOperandNo())) {
      Worklist.push_back(&UseMI);
    }
  }
}

void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
                                SmallVectorImpl<MachineInstr *> &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineBasicBlock::iterator I = SCCDefInst,
       E = SCCDefInst->getParent()->end(); I != E; ++I) {

    // Exit if we find another SCC def.
    if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.push_back(I);
  }
}

/// Return the VGPR-equivalent register class for the destination of \p Inst
/// when it is moved to the VALU, or nullptr if no rewrite of the destination
/// class is needed/possible.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI->getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(*MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI->getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// Reserve the VGPRs in [Begin, End] used for indirect addressing, in every
/// register class wide enough to overlap that range.
void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                           const MachineFunction &MF) const {
  int End = getIndirectIndexEnd(MF);
  int Begin = getIndirectIndexBegin(MF);

  if (End == -1)
    return;


  for (int Index = Begin; Index <= End; ++Index)
    Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));

  // For the wider classes, a register starting up to (Size/32 - 1) slots
  // before Begin still overlaps the reserved range.
  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}

/// Return the named operand of \p MI, or nullptr if the opcode has no
/// operand with that name.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    RsrcDataFormat |= (1ULL << 56);

    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      // Set MTYPE = 2
      RsrcDataFormat |= (2ULL << 59);
} 2876 2877 return RsrcDataFormat; 2878 } 2879 2880 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 2881 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 2882 AMDGPU::RSRC_TID_ENABLE | 2883 0xffffffff; // Size; 2884 2885 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 2886 2887 Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT); 2888 2889 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 2890 // Clear them unless we want a huge stride. 2891 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2892 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 2893 2894 return Rsrc23; 2895 } 2896 2897 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { 2898 unsigned Opc = MI->getOpcode(); 2899 2900 return isSMRD(Opc); 2901 } 2902 2903 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { 2904 unsigned Opc = MI->getOpcode(); 2905 2906 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 2907 } 2908 2909 ArrayRef<std::pair<int, const char *>> 2910 SIInstrInfo::getSerializableTargetIndices() const { 2911 static const std::pair<int, const char *> TargetIndices[] = { 2912 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 2913 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 2914 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 2915 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 2916 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 2917 return makeArrayRef(TargetIndices); 2918 } 2919