1 //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "SIInstrInfo.h" 17 #include "AMDGPUTargetMachine.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineInstrBuilder.h" 22 #include "llvm/CodeGen/MachineRegisterInfo.h" 23 #include "llvm/IR/Function.h" 24 #include "llvm/CodeGen/RegisterScavenging.h" 25 #include "llvm/MC/MCInstrDesc.h" 26 #include "llvm/Support/Debug.h" 27 28 using namespace llvm; 29 30 SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) 31 : AMDGPUInstrInfo(st), RI() {} 32 33 //===----------------------------------------------------------------------===// 34 // TargetInstrInfo callbacks 35 //===----------------------------------------------------------------------===// 36 37 static unsigned getNumOperandsNoGlue(SDNode *Node) { 38 unsigned N = Node->getNumOperands(); 39 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 40 --N; 41 return N; 42 } 43 44 static SDValue findChainOperand(SDNode *Load) { 45 SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); 46 assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); 47 return LastOp; 48 } 49 50 /// \brief Returns true if both nodes have the same value for the given 51 /// operand \p Op, or if both nodes do not have this operand. 
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither node has this named operand: the (absent) values match.
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  // Exactly one node has the operand: cannot match.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                   AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    return true;
  default:
    return false;
  }
}

/// \brief Determine whether two selection-DAG loads read from the same base
/// pointer, and if so report each load's constant byte offset. Handles the
/// DS (LDS), SMRD, and MUBUF/MTBUF addressing forms separately because they
/// place their address operands at different positions.
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

/// True for the DS read2/write2 "st64" variants, whose two offsets are in
/// units of 64 elements rather than 1.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

/// \brief Report the base register and byte offset of a memory instruction,
/// when they can be determined. Supports DS (including the two-offset
/// read2/write2 forms when the offsets are consecutive), MUBUF/MTBUF with no
/// soffset operand, and SMRD with an immediate offset.
bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        unsigned &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();

  if (isDS(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive. We
    // will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    // DS_PERMUTE does not have Offset0Imm (and Offset1Imm).
    if (!Offset0Imm)
      return false;

    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        // The destination holds both elements, so one element is half the
        // destination register class size.
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    // With a register soffset the total offset is not a known immediate.
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

/// \brief Scheduler hook: cluster up to 4 loads when both are the same kind
/// of memory access (DS with DS, SMRD with SMRD, or buffer with buffer).
bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  // TODO: This needs finer tuning
  if (NumLoads > 4)
    return false;

  if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
    return true;

  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
    return true;

  return (isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
         (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt));
}

/// \brief Emit a physical register copy. Small register classes are copied
/// with a single mov; larger classes are decomposed into per-subregister
/// movs, ordered so a forward (low-to-high) or backward walk avoids
/// clobbering overlapping source subregisters.
void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
  };

  static const int16_t Sub0_7_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
  };

  static const int16_t Sub0_3_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1,
  };

  unsigned Opcode;
  ArrayRef<int16_t> SubIndices;
  bool Forward;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        // Materialize the i1 value in VCC by comparing the VGPR against 0.
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_3_64;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_7_64;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_15_64;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  // Copy low-to-high when the destination starts at or below the source, so
  // subregisters of an overlapping source are read before being overwritten;
  // otherwise copy high-to-low.
  if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
    Forward = true;
  else
    Forward = false;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    // Kill the full source register on the last piece, and mark the full
    // destination as implicitly defined on the first piece, so the liveness
    // of the super-registers stays correct.
    if (Idx == SubIndices.size() - 1)
      Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

/// \brief Map an opcode to its commuted form (or back), returning -1 when the
/// commuted opcode does not exist on this target, and the original opcode
/// when no commute mapping exists at all.
int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

/// \brief Pick the mov opcode matching the destination register class: scalar
/// or vector, 32- or 64-bit; fall back to a generic COPY otherwise.
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

/// Select the SGPR spill-save pseudo for a register of \p Size bytes.
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Select the VGPR spill-save pseudo for a register of \p Size bytes.
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief Spill \p SrcReg to stack slot \p FrameIndex using the SI spill
/// pseudos. SGPR spills get a bare pseudo (lowered later); VGPR spills carry
/// the scratch resource descriptor and wave offset. Emits an error (and a
/// KILL placeholder) when VGPR spilling is not enabled.
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  unsigned Size = FrameInfo->getObjectSize(FrameIndex);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling
    // SGPRs.
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)            // src
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MFI)) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    // Emit a KILL so the register is still used and verification passes.
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg)                          // src
    .addFrameIndex(FrameIndex)               // frame_idx
    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
    .addImm(0)                               // offset
    .addMemOperand(MMO);
}

/// Select the SGPR spill-restore pseudo for a register of \p Size bytes.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Select the VGPR spill-restore pseudo for a register of \p Size bytes.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// \brief Reload \p DestReg from stack slot \p FrameIndex; the mirror of
/// storeRegToStackSlot, using the matching RESTORE pseudos.
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MFI)) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    // Define the register so the verifier still sees a def.
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)               // frame_idx
    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
    .addImm(0)                               // offset
    .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    // Compute the thread ID once, in the entry block, and cache it in MFI.
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;


    if (MFI->getShaderType() == ShaderType::COMPUTE &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
        TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(&Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the thread id by 4 to get a per-thread dword address.
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

/// \brief Insert \p Count wait states before \p MI, using S_NOP instructions
/// that each encode up to 8 wait states (imm 0..7 means 1..8 states).
void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
                                   int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

/// \brief Expand SI post-RA pseudo instructions into real machine
/// instructions (64-bit movs and cndmasks split into 32-bit halves, and the
/// PC-relative constant-data pointer sequence).
bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::SI_CONSTDATA_PTR: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                           .addReg(RegLo)
                           .addOperand(MI->getOperand(1)));
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                           .addReg(RegHi)
                           .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

/// Commutes the operands in the given instruction.
/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
///
/// Do not call this method for a non-commutable instruction or for
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
891 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, 892 bool NewMI, 893 unsigned OpIdx0, 894 unsigned OpIdx1) const { 895 int CommutedOpcode = commuteOpcode(*MI); 896 if (CommutedOpcode == -1) 897 return nullptr; 898 899 int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 900 AMDGPU::OpName::src0); 901 MachineOperand &Src0 = MI->getOperand(Src0Idx); 902 if (!Src0.isReg()) 903 return nullptr; 904 905 int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 906 AMDGPU::OpName::src1); 907 908 if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || 909 OpIdx1 != static_cast<unsigned>(Src1Idx)) && 910 (OpIdx0 != static_cast<unsigned>(Src1Idx) || 911 OpIdx1 != static_cast<unsigned>(Src0Idx))) 912 return nullptr; 913 914 MachineOperand &Src1 = MI->getOperand(Src1Idx); 915 916 917 if (isVOP2(*MI)) { 918 const MCInstrDesc &InstrDesc = MI->getDesc(); 919 // For VOP2 instructions, any operand type is valid to use for src0. Make 920 // sure we can use the src1 as src0. 921 // 922 // We could be stricter here and only allow commuting if there is a reason 923 // to do so. i.e. if both operands are VGPRs there is no real benefit, 924 // although MachineCSE attempts to find matches by commuting. 925 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 926 if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) 927 return nullptr; 928 } 929 930 if (!Src1.isReg()) { 931 // Allow commuting instructions with Imm operands. 932 if (NewMI || !Src1.isImm() || 933 (!isVOP2(*MI) && !isVOP3(*MI))) { 934 return nullptr; 935 } 936 // Be sure to copy the source modifiers to the right place. 937 if (MachineOperand *Src0Mods 938 = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { 939 MachineOperand *Src1Mods 940 = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); 941 942 int Src0ModsVal = Src0Mods->getImm(); 943 if (!Src1Mods && Src0ModsVal != 0) 944 return nullptr; 945 946 // XXX - This assert might be a lie. 
It might be useful to have a neg 947 // modifier with 0.0. 948 int Src1ModsVal = Src1Mods->getImm(); 949 assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); 950 951 Src1Mods->setImm(Src0ModsVal); 952 Src0Mods->setImm(Src1ModsVal); 953 } 954 955 unsigned Reg = Src0.getReg(); 956 unsigned SubReg = Src0.getSubReg(); 957 if (Src1.isImm()) 958 Src0.ChangeToImmediate(Src1.getImm()); 959 else 960 llvm_unreachable("Should only have immediates"); 961 962 Src1.ChangeToRegister(Reg, false); 963 Src1.setSubReg(SubReg); 964 } else { 965 MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); 966 } 967 968 if (MI) 969 MI->setDesc(get(CommutedOpcode)); 970 971 return MI; 972 } 973 974 // This needs to be implemented because the source modifiers may be inserted 975 // between the true commutable operands, and the base 976 // TargetInstrInfo::commuteInstruction uses it. 977 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, 978 unsigned &SrcOpIdx0, 979 unsigned &SrcOpIdx1) const { 980 const MCInstrDesc &MCID = MI->getDesc(); 981 if (!MCID.isCommutable()) 982 return false; 983 984 unsigned Opc = MI->getOpcode(); 985 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 986 if (Src0Idx == -1) 987 return false; 988 989 // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on 990 // immediate. Also, immediate src0 operand is not handled in 991 // SIInstrInfo::commuteInstruction(); 992 if (!MI->getOperand(Src0Idx).isReg()) 993 return false; 994 995 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 996 if (Src1Idx == -1) 997 return false; 998 999 MachineOperand &Src1 = MI->getOperand(Src1Idx); 1000 if (Src1.isImm()) { 1001 // SIInstrInfo::commuteInstruction() does support commuting the immediate 1002 // operand src1 in 2 and 3 operand instructions. 
    // Only the VOP2/VOP3 commute path knows how to fold an immediate src1.
    if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
      return false;
  } else if (Src1.isReg()) {
    // If any source modifiers are set, the generic instruction commuting won't
    // understand how to copy the source modifiers.
    if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
      return false;
  } else
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

// Strip the src0/src1/src2 modifier operands from \p MI, removing the
// highest-indexed operand first so the earlier indices stay valid.
// NOTE(review): assumes all three modifier operands exist; only called below
// for V_MAD_F32 / V_MAC_F32_e64, which have them.
static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

// Try to fold the immediate defined by \p DefMI (its operand 1) into its
// single non-debug use \p UseMI, rewriting v_mad/v_mac into v_madmk_f32
// (constant multiplicand) or v_madak_f32 (constant addend). On success the
// def is erased and true is returned.
bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      // madmk reads its other sources from VGPRs only.
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to do some weird looking operand shuffling since the madmk
      // operands are out of the normal expected order with the multiplied
      // constant as the last operand.
      //
      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
      // src0 -> src2 K
      // src1 -> src0
      // src2 -> src1

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      // Rotate src1 -> src0, src2 -> src1 (the constant goes into src2).
      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      unsigned Src2Reg = Src2->getReg();
      unsigned Src2SubReg = Src2->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      Src1->setReg(Src2Reg);
      Src1->setSubReg(Src2SubReg);
      Src1->setIsKill(Src2->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        // src2 is tied to the destination on v_mac; must untie before it can
        // become an immediate.
        UseMI->untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      Src2->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

// Returns true when [OffsetA, OffsetA+WidthA) and [OffsetB, OffsetB+WidthB)
// do not intersect.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

// Returns true when both accesses use the same base register and their
// [offset, offset+size) byte ranges provably do not overlap.
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, Offset0;
  unsigned BaseReg1, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
1201 if (isDS(*MIa)) { 1202 if (isDS(*MIb)) 1203 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1204 1205 return !isFLAT(*MIb); 1206 } 1207 1208 if (isMUBUF(*MIa) || isMTBUF(*MIa)) { 1209 if (isMUBUF(*MIb) || isMTBUF(*MIb)) 1210 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1211 1212 return !isFLAT(*MIb) && !isSMRD(*MIb); 1213 } 1214 1215 if (isSMRD(*MIa)) { 1216 if (isSMRD(*MIb)) 1217 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1218 1219 return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); 1220 } 1221 1222 if (isFLAT(*MIa)) { 1223 if (isFLAT(*MIb)) 1224 return checkInstOffsetsDoNotOverlap(MIa, MIb); 1225 1226 return false; 1227 } 1228 1229 return false; 1230 } 1231 1232 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 1233 MachineBasicBlock::iterator &MI, 1234 LiveVariables *LV) const { 1235 1236 switch (MI->getOpcode()) { 1237 default: return nullptr; 1238 case AMDGPU::V_MAC_F32_e64: break; 1239 case AMDGPU::V_MAC_F32_e32: { 1240 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1241 if (Src0->isImm() && !isInlineConstant(*Src0, 4)) 1242 return nullptr; 1243 break; 1244 } 1245 } 1246 1247 const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst); 1248 const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); 1249 const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); 1250 const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); 1251 1252 return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) 1253 .addOperand(*Dst) 1254 .addImm(0) // Src0 mods 1255 .addOperand(*Src0) 1256 .addImm(0) // Src1 mods 1257 .addOperand(*Src1) 1258 .addImm(0) // Src mods 1259 .addOperand(*Src2) 1260 .addImm(0) // clamp 1261 .addImm(0); // omod 1262 } 1263 1264 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 1265 int64_t SVal = Imm.getSExtValue(); 1266 if (SVal >= -16 && SVal <= 64) 1267 return true; 1268 1269 if (Imm.getBitWidth() == 64) { 
    uint64_t Val = Imm.getZExtValue();
    // 64-bit operands: beyond the small-integer range checked above, only the
    // special double bit patterns are inline constants.
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

// Returns true when the immediate operand \p MO, interpreted at \p OpSize
// bytes, is an inline constant. Non-immediate operands are never inline
// constants.
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.

    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

// A literal constant is an immediate that must be encoded as an extra dword
// (i.e. it is not an inline constant).
bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

// Compare two register or immediate operands for equality; other operand
// kinds are not expected here.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

// Returns true if immediate \p MO is a legal value for operand \p OpNo of
// \p MI, honoring whether the operand accepts literal or only inline
// constants.
bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

// Returns true if \p Opcode has a 32-bit (e32) VOP encoding that exists for
// the current subtarget.
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

// Returns true when the named modifier operand exists on \p MI and has a
// non-zero value.
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

// Returns true when operand \p MO would occupy the single SGPR/constant-bus
// read slot of a VALU instruction.
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

// Returns the first implicitly read SGPR (VCC/M0/FLAT_SCR) of \p MI, or
// AMDGPU::NoRegister when there is none.
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

// MachineVerifier hook: checks SI-specific structural invariants of \p MI.
// On failure, sets \p ErrInfo to a diagnostic and returns false.
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes
  // all SCC users are in the same blocks as their defs.
  const MachineBasicBlock *MBB = MI->getParent();
  if (MI == &MBB->front()) {
    if (MBB->isLiveIn(AMDGPU::SCC)) {
      ErrInfo = "scc register cannot be live across blocks.";
      return false;
    }
  }

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      // Accepts registers, inline constants, and literals.
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      // Accepts registers and inline constants only; literals are illegal.
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      // Only physical registers can be checked against the descriptor's
      // register class here.
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }


  // Verify VOP*
  if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = findImplicitSGPRRead(*MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          // Multiple reads of the same SGPR count as one constant-bus use.
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
    if (!Exec || !Exec->isImplicit()) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  return true;
}

// Maps an SALU opcode to the equivalent VALU opcode, or
// INSTRUCTION_LIST_END when there is no VALU equivalent.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    // A register move becomes a COPY; an immediate move needs V_MOV.
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case
AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): S_NOT_B64 maps to the 32-bit V_NOT_B32_e32 here; presumably
  // the caller splits the 64-bit operation into two halves — confirm.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

// Returns true when the SALU instruction has a VALU equivalent.
bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

// Returns the register class of operand \p OpNo: the class recorded in the
// instruction descriptor when available, otherwise the class of the register
// actually used by the operand.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

// Returns true if operand \p OpNo may be a VGPR. For copy-like generic
// opcodes, all operands must match the destination's bank, so the
// destination class is checked instead.
bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

// Legalize operand \p OpIdx by copying/moving its current value into a fresh
// virtual register inserted before \p MI, then rewriting the operand to use
// that register.
void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &MO = MI->getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  // NOTE(review): the new register is always created in a VGPR class (32 or
  // 64 bit), even on the S_MOV_B32 path above — confirm this matches the
  // intended legalization.
  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
    .addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

// Extract sub-register \p SubIdx of \p SuperReg into a fresh virtual register
// of class \p SubRC, inserting the copies before \p MI. Returns the new
// register.
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    // Simple case: a direct sub-register copy suffices.
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

// Like buildExtractSubReg, but also handles 64-bit immediates by returning
// the selected 32-bit half as an immediate operand.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    // XXX - Is there a better way to do this?
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(Op.getImm() >> 32);

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
  assert(Inst->getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst->getOperand(1);
  Inst->RemoveOperand(1);
  Inst->addOperand(Op1);
}

// Returns true if register operand \p MO satisfies the register-class
// constraint \p OpInfo (taking any sub-register index into account).
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  unsigned Reg = MO.getReg();
  const TargetRegisterClass *RC =
    TargetRegisterInfo::isVirtualRegister(Reg) ?
    MRI.getRegClass(Reg) :
    RI.getPhysRegClass(Reg);

  const SIRegisterInfo *TRI =
    static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  RC = TRI->getSubRegClass(RC, MO.getSubReg());

  // In order to be legal, the common sub-class must be equal to the
  // class of the current operand. For example:
  //
  // v_mov_b32 s0 ; Operand defined as vsrc_32
  //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

// Returns true if \p MO (register or immediate-like) is legal for a VSrc
// operand described by \p OpInfo.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

// Returns true if \p MO (defaulting to MI's current operand \p OpIdx) would
// be a legal value for that operand slot, including the VALU single
// constant-bus-use restriction.
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI->getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
    OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  // NOTE(review): DefinedRC is dereferenced here but may be null when
  // OpInfo.RegClass == -1; presumably VALU src operands always have a
  // register class — confirm.
  if (isVALU(*MI) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {

    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    // Reject if any *other* operand already consumes the constant bus.
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI->getOperand(i);
      if (Op.isReg() &&
          (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }


  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Rewrite the operands of VOP2 instruction \p MI so they satisfy the
// encoding's constraints (src1 must be a VGPR; only one constant-bus use).
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI->getOperand(Src0Idx);

    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI->isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI->getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(*MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI->setDesc(get(CommutedOpc));

  // Swap src0 and src1 in place; save src0's fields before overwriting.
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(
  MachineRegisterInfo &MRI,
  MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI->getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
1950 legalizeOpWithMove(MI, Idx); 1951 } 1952 } 1953 1954 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI, 1955 MachineRegisterInfo &MRI) const { 1956 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 1957 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 1958 unsigned DstReg = MRI.createVirtualRegister(SRC); 1959 unsigned SubRegs = VRC->getSize() / 4; 1960 1961 SmallVector<unsigned, 8> SRegs; 1962 for (unsigned i = 0; i < SubRegs; ++i) { 1963 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 1964 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), 1965 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 1966 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 1967 SRegs.push_back(SGPR); 1968 } 1969 1970 MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI, 1971 UseMI->getDebugLoc(), 1972 get(AMDGPU::REG_SEQUENCE), DstReg); 1973 for (unsigned i = 0; i < SubRegs; ++i) { 1974 MIB.addReg(SRegs[i]); 1975 MIB.addImm(RI.getSubRegFromChannel(i)); 1976 } 1977 return DstReg; 1978 } 1979 1980 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 1981 MachineInstr *MI) const { 1982 1983 // If the pointer is store in VGPRs, then we need to move them to 1984 // SGPRs using v_readfirstlane. This is safe because we only select 1985 // loads with uniform pointers to SMRD instruction so we know the 1986 // pointer value is uniform. 
  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

/// \brief Legalize all register operands of \p MI.
///
/// Dispatches to the encoding-specific helpers for VOP2/VOPC, VOP3 and SMRD,
/// and handles PHI, REG_SEQUENCE, INSERT_SUBREG, MIMG and MUBUF inline.
/// May insert copies, readfirstlanes, or (for MUBUF _OFFSET forms) replace
/// \p MI with an ADDR64 variant.
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  // Legalize VOP2
  if (isVOP2(*MI) || isVOPC(*MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(*MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(*MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // PHI operands come in (register, predecessor-block) pairs starting at 1.
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
        MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI->getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction, so the copy must be emitted in the
      // corresponding predecessor block, before its terminator.
      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI->getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI->getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
        .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MIMG: srsrc and ssamp must be SGPRs; readfirstlane them if not.
  if (isMIMG(*MI)) {
    MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor (zero base address, default data
    // format in the upper 64 bits).
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
            Zero64)
      .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI->getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 (consumes the carry from the
      // low half via VCC).
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB
          = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC
            = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE
            = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addOperand(*VDataIn)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset)
          .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
          .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
      }

      // The old _OFFSET instruction is fully replaced by the new ADDR64 one.
      MI->removeFromParent();
      MI = Addr64;

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      // Re-find the operands on the freshly built ADDR64 instruction.
      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

/// \brief Move \p TopInst and, transitively, every affected user from the
/// scalar (SALU) unit to the vector (VALU) unit.
///
/// Works off a worklist: each instruction is either handled by a special-case
/// lowering helper, rewritten in place to its VALU opcode, or (when no VALU
/// equivalent exists) merely has its operands legalized. Users of rewritten
/// destinations are pushed back onto the worklist.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SmallVector<MachineInstr *, 128> Worklist;
  Worklist.push_back(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr *Inst = Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst->getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst->getOpcode();
    unsigned NewOpcode = getVALUOp(*Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    // 64-bit bitwise ops have no VALU equivalent; split them into two 32-bit
    // VALU ops on the sub0/sub1 halves.
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64: {
      splitScalar64BitBFE(Worklist, Inst);
      Inst->eraseFromParent();
      continue;
    }

    // On VI and newer the VALU shifts take the shift amount in src0 (the
    // *REV forms), so the operands must be swapped when retargeting.
    case AMDGPU::S_LSHL_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
        .addReg(AMDGPU::EXEC)
        .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst->setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // We're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst->getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        Inst->RemoveOperand(i);
        addSCCDefUsersToVALUWorklist(Inst, Worklist);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst->addOperand(MachineOperand::CreateImm(0));
      Inst->addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst->addOperand(MachineOperand::CreateImm(0));
    }

    Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst->RemoveOperand(2); // Remove old immediate.
      Inst->addOperand(MachineOperand::CreateImm(Offset));
      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
      if (!NewDstRC)
        continue;

      unsigned DstReg = Inst->getOperand(0).getReg();
      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    legalizeOperands(Inst);

    if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

//===----------------------------------------------------------------------===//
// Indirect addressing callbacks
//===----------------------------------------------------------------------===//

const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
  return &AMDGPU::VGPR_32RegClass;
}

/// \brief Lower S_ABS_I32 to VALU as max(x, 0 - x).
void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
                                 MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  // ResultReg = max(Src, TmpReg)
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Split a 64-bit scalar unary operation into two 32-bit VALU
/// instructions (opcode \p Opcode) applied to the sub0 and sub1 halves,
/// recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0);

  // High half.
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// \brief Split a 64-bit scalar binary operation into two 32-bit VALU
/// instructions (opcode \p Opcode) applied pairwise to the sub0 and sub1
/// halves of both sources, recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(
  SmallVectorImpl<MachineInstr *> &Worklist,
  MachineInstr *Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src0 = Inst->getOperand(1);
  MachineOperand &Src1 = Inst->getOperand(2);
  DebugLoc DL = Inst->getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
    .addOperand(SrcReg0Sub0)
    .addOperand(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
    .addOperand(SrcReg0Sub1)
    .addOperand(SrcReg1Sub1);

  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// \brief Lower S_BCNT1_I32_B64 by popcounting each 32-bit half with
/// V_BCNT_U32_B32, accumulating the first count into the second.
void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                                       MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  MachineOperand &Src = Inst->getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = popcount(sub0) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg)
    .addOperand(SrcRegSub0)
    .addImm(0);

  // ResultReg = popcount(sub1) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
    .addOperand(SrcRegSub1)
    .addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Lower S_BFE_I64 (sign-extend-in-register: offset 0, width <= 32,
/// per the assert below) to VALU: a V_BFE_I32 (or a plain copy when width is
/// exactly 32) for the low half, and an arithmetic shift by 31 to produce the
/// sign-extended high half.
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  uint32_t Imm = Inst->getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
         BitWidth <= 32 &&
         Offset == 0 &&
         "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // MidRegLo = sign-extended field of width BitWidth from the low half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    // MidRegHi = replicated sign bit of MidRegLo.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half passes through; only the high half needs to
  // be filled with the sign bit.
  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Push every user of \p DstReg that cannot read a VGPR in the using
/// operand onto the moveToVALU worklist.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SmallVectorImpl<MachineInstr *> &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E; ++I) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.push_back(&UseMI);
    }
  }
}

/// \brief Push all readers of SCC between \p SCCDefInst and the next SCC def
/// (or end of block) onto the moveToVALU worklist.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
                                               SmallVectorImpl<MachineInstr *> &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineBasicBlock::iterator I = SCCDefInst,
       E = SCCDefInst->getParent()->end(); I != E; ++I) {

    // Exit if we find another SCC def.
    if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.push_back(I);
  }
}

/// \brief Return the VGPR-equivalent register class for \p Inst's destination
/// when moving it to the VALU, or null if no move is needed/possible.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI->getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
2784 // 2785 // If the operand's class is an SGPR, we can never move it. 2786 2787 unsigned SGPRReg = findImplicitSGPRRead(*MI); 2788 if (SGPRReg != AMDGPU::NoRegister) 2789 return SGPRReg; 2790 2791 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2792 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2793 2794 for (unsigned i = 0; i < 3; ++i) { 2795 int Idx = OpIndices[i]; 2796 if (Idx == -1) 2797 break; 2798 2799 const MachineOperand &MO = MI->getOperand(Idx); 2800 if (!MO.isReg()) 2801 continue; 2802 2803 // Is this operand statically required to be an SGPR based on the operand 2804 // constraints? 2805 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 2806 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 2807 if (IsRequiredSGPR) 2808 return MO.getReg(); 2809 2810 // If this could be a VGPR or an SGPR, Check the dynamic register class. 2811 unsigned Reg = MO.getReg(); 2812 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 2813 if (RI.isSGPRClass(RegRC)) 2814 UsedSGPRs[i] = Reg; 2815 } 2816 2817 // We don't have a required SGPR operand, so we have a bit more freedom in 2818 // selecting operands to move. 2819 2820 // Try to select the most used SGPR. If an SGPR is equal to one of the 2821 // others, we choose that. 2822 // 2823 // e.g. 2824 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2825 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2826 2827 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 2828 // prefer those. 
  // Prefer an SGPR that is repeated among the operands: keeping it on the
  // constant bus covers the most operands with a single read.
  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  // May still be NoRegister when no SGPR operand is repeated.
  return SGPRReg;
}

/// \brief Reserve every VGPR — and every wider VGPR tuple overlapping the
/// range — that indirect register accesses in \p MF may address, as computed
/// by getIndirectIndexBegin/End.
void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                          const MachineFunction &MF) const {
  int End = getIndirectIndexEnd(MF);
  int Begin = getIndirectIndexBegin(MF);

  // No indirect accesses in this function: nothing to reserve.
  if (End == -1)
    return;


  for (int Index = Begin; Index <= End; ++Index)
    Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));

  // For an N-dword register class, a tuple that starts up to N-1 registers
  // before Begin still overlaps [Begin, End], so the reserved range is
  // widened by N-1 at the low end (clamped at register 0).
  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}

/// \brief Return the named operand of \p MI, or nullptr if \p MI's opcode
/// has no operand with that name.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// \brief Default resource-descriptor data-format bits, adjusted for the
/// target OS and subtarget generation.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // NOTE(review): bit 56 is presumably the ATC bit for HSA address
    // translation — confirm against the ISA buffer-descriptor layout.
    RsrcDataFormat |= (1ULL << 56);

    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      // Set MTYPE = 2
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// \brief Compute words 2-3 of the scratch buffer resource descriptor.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size; presumably all-ones means maximum —
                                // NOTE(review): confirm descriptor layout.

  // NOTE(review): this encodes log2(max private element size) - 1, so an
  // element size of 4 bytes yields 1 — confirm against the ELEMENT_SIZE
  // field encoding.
  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;

  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT);

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

/// \brief Return true iff \p MI is an SMRD (scalar memory read).
/// Presumably used by the scheduler to classify latency — confirm callers.
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  return isSMRD(Opc);
}

/// \brief Return true iff \p MI is a buffer (MUBUF/MTBUF) or image (MIMG)
/// memory operation. Presumably the high-latency counterpart of the check
/// above — confirm callers.
bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

/// \brief Map the AMDGPU target-index operand values to the names used for
/// them in serialized MIR.
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  // Static storage: the returned ArrayRef must remain valid after return.
  static const std::pair<int, const char *> TargetIndices[] = {
    {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}