//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//


#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if neither node has this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic here except for the implicit exec check.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
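    // If so, the offset is not a compile-time constant yet (e.g. a scratch
    // access whose frame index has not been eliminated), so we cannot compare
    // the two offsets here and conservatively give up.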
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        unsigned &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();
  if (isDS(Opc)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();
    assert(Offset1 > Offset0);

    if (Offset1 - Offset0 == 1) {
      // Each of these offsets is in element-sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(Opc) || isMTBUF(Opc)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(Opc)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  unsigned Opc0 = FirstLdSt->getOpcode();
  unsigned Opc1 = SecondLdSt->getOpcode();

  // TODO: This needs finer tuning.
  if (NumLoads > 4)
    return false;

  if (isDS(Opc0) && isDS(Opc1))
    return true;

  if (isSMRD(Opc0) && isSMRD(Opc1))
    return true;

  if ((isMUBUF(Opc0) ||
       isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
    return true;

  return false;
}

void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1, 0
  };

  unsigned Opcode;
  const int16_t *SubIndices;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 is removed.
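        // Materialize the i1 value held in a VGPR into VCC by comparing it
        // against zero: each lane of VCC is set where the corresponding VGPR
        // lane is nonzero.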
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_15;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  while (unsigned SubIdx = *SubIndices++) {
    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));

    if (*SubIndices)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode.
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode.
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ?
           AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  int Opcode = -1;

  if (RI.isSGPRClass(RC)) {
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_S32_SAVE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
    }
  } else if (RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    MFI->setHasSpilledVGPRs();

    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_V32_SAVE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_V64_SAVE;  break;
    case 96:  Opcode = AMDGPU::SI_SPILL_V96_SAVE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
    }
  }

  if (Opcode != -1) {
    FrameInfo->setObjectAlignment(FrameIndex, 4);
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)
      .addFrameIndex(FrameIndex)
      // Place-holder registers; these will be filled in by
      // SIPrepareScratchRegs.
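      // (The SGPR quad is expected to become the scratch resource descriptor
      // and the single SGPR the scratch offset; both are rewritten later, so
      // they are marked undef here.)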
      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
      .addReg(AMDGPU::SGPR0, RegState::Undef);
  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  int Opcode = -1;

  if (RI.isSGPRClass(RC)) {
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
    }
  } else if (RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_V32_RESTORE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_V64_RESTORE;  break;
    case 96:  Opcode = AMDGPU::SI_SPILL_V96_RESTORE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
    }
  }

  if (Opcode != -1) {
    FrameInfo->setObjectAlignment(FrameIndex, 4);
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)
      // Place-holder registers; these will be filled in by
      // SIPrepareScratchRegs.
      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
      .addReg(AMDGPU::SGPR0, RegState::Undef);

  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
  }
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;


    if (MFI->getShaderType() == ShaderType::COMPUTE &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(&Entry);
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id.
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

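  // The per-workitem term (TID * 4) is already in TIDReg from the shift
  // above, so the spill address computed below is
  // LDSSize + FrameOffset * WorkGroupSize + TID * 4, giving each workitem
  // its own slot for this frame index.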
  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
                             int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SI_CONSTDATA_PTR: {
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);

    // Add 32-bit offset from this instruction to the start of the constant data.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
      .addReg(RegLo)
      .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
      .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
      .addReg(RegHi)
      .addImm(0)
      .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
      .addReg(AMDGPU::SCC, RegState::Implicit);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                              bool NewMI) const {

  if (MI->getNumOperands() < 3)
    return nullptr;

  int CommutedOpcode = commuteOpcode(*MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  assert(Src0Idx != -1 && "Should always have src0 operand");

  MachineOperand &Src0 = MI->getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return nullptr;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // Make sure it's legal to commute operands for VOP2.
  if (isVOP2(MI->getOpcode()) &&
      (!isOperandLegal(MI, Src0Idx, &Src1) ||
       !isOperandLegal(MI, Src1Idx, &Src0))) {
    return nullptr;
  }

  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() ||
        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
      return nullptr;
    }

    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods
          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods
        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
  }

  if (MI)
    MI->setDesc(get(CommutedOpcode));

  return MI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses this hook to find them.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                        unsigned &SrcOpIdx1,
                                        unsigned &SrcOpIdx2) const {
  const MCInstrDesc &MCID = MI->getDesc();
  if (!MCID.isCommutable())
    return false;

  unsigned Opc = MI->getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
  // immediate.
  if (!MI->getOperand(Src0Idx).isReg())
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  if (!MI->getOperand(Src1Idx).isReg())
    return false;

  // If any source modifiers are set, the generic instruction commuting won't
  // understand how to copy the source modifiers.
  if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
      hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
    return false;

  SrcOpIdx1 = Src0Idx;
  SrcOpIdx2 = Src1Idx;
  return true;
}

MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
                                         MachineBasicBlock::iterator I,
                                         unsigned DstReg,
                                         unsigned SrcReg) const {
  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
                 DstReg)
    .addReg(SrcReg);
}

bool SIInstrInfo::isMov(unsigned Opcode) const {
  switch (Opcode) {
  default: return false;
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  }
}

bool
SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
  return RC != &AMDGPU::EXECRegRegClass;
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      if (!Src2->isReg() ||
          (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
        return false;

      // We need to do some weird looking operand shuffling since the madmk
      // operands are out of the normal expected order with the multiplied
      // constant as the last operand.
      //
      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
      // src0 -> src2 K
      // src1 -> src0
      // src2 -> src1

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      unsigned Src2Reg = Src2->getReg();
      unsigned Src2SubReg = Src2->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      Src1->setReg(Src2Reg);
      Src1->setSubReg(Src2SubReg);
      Src1->setIsKill(Src2->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::src2));
      // ChangeToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangeToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

bool
SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
                                         AliasAnalysis *AA) const {
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
    return MI->getOperand(1).isImm();
  }
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, Offset0;
  unsigned BaseReg1, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  unsigned Opc0 = MIa->getOpcode();
  unsigned Opc1 = MIb->getOpcode();

  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
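  // The checks below reason about the memory each instruction class can
  // touch: DS only accesses LDS, while FLAT may access either LDS or global
  // memory, so a DS access can only alias another DS access or a FLAT
  // access. Buffer (MUBUF/MTBUF) and scalar (SMRD) accesses both go to
  // global/constant memory, so they are conservatively assumed to alias each
  // other as well as FLAT accesses.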
  if (isDS(Opc0)) {
    if (isDS(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1);
  }

  if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
    if (isMUBUF(Opc1) || isMTBUF(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1) && !isSMRD(Opc1);
  }

  if (isSMRD(Opc0)) {
    if (isSMRD(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1) && !isMUBUF(Opc1) && !isMTBUF(Opc1);
  }

  if (isFLAT(Opc0)) {
    if (isFLAT(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineBasicBlock::iterator &MI,
                                                 LiveVariables *LV) const {

  switch (MI->getOpcode()) {
  default: return nullptr;
  case AMDGPU::V_MAC_F32_e64: break;
  case AMDGPU::V_MAC_F32_e32: {
    const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
  const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
  const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
  const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);

  return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
    .addOperand(*Dst)
    .addImm(0) // Src0 mods
    .addOperand(*Src0)
    .addImm(0) // Src1 mods
    .addOperand(*Src1)
    .addImm(0) // Src2 mods
    .addOperand(*Src2)
    .addImm(0)  // clamp
    .addImm(0); // omod
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate.
    // It would be for any 32-bit integer operand, but would not be for a
    // 64-bit one.

    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus.
  if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
      (!MO.isImplicit() &&
       (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
        AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
    return true;
  }

  return false;
}

bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }


  // Verify VOP*
  if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
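    // The hardware allows a VALU instruction to read at most one value over
    // the constant bus (an SGPR, M0, VCC, FLAT_SCR, or a literal), so count
    // how many distinct constant-bus values the source operands use.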
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = AMDGPU::NoRegister;
    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  return true;
}

unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case
       AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &MO = MI->getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;


  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
    .addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  assert(SuperReg.isReg());

  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index
  // with the SubIdx passed to this function. The register coalescer should
  // be able to eliminate this extra copy.
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    // XXX - Is there a better way to do this?
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(Op.getImm() >> 32);

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
                                    MachineBasicBlock::iterator MI,
                                    MachineRegisterInfo &MRI,
                                    const TargetRegisterClass *RC,
                                    const MachineOperand &Op) const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned Dst = MRI.createVirtualRegister(RC);

  MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
                             LoDst)
    .addImm(Op.getImm() & 0xFFFFFFFF);
  MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
                             HiDst)
    .addImm(Op.getImm() >> 32);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
    .addReg(LoDst)
    .addImm(AMDGPU::sub0)
    .addReg(HiDst)
    .addImm(AMDGPU::sub1);

  Worklist.push_back(Lo);
  Worklist.push_back(Hi);

  return Dst;
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1).
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
  assert(Inst->getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst->getOperand(1);
  Inst->RemoveOperand(1);
  Inst->addOperand(Op1);
}

bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = get(MI->getOpcode());
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  if (isVALU(InstDesc.Opcode) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
    unsigned SGPRUsed =
        MO->isReg() ?
MO->getReg() : (unsigned)AMDGPU::NoRegister; 1661 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 1662 if (i == OpIdx) 1663 continue; 1664 const MachineOperand &Op = MI->getOperand(i); 1665 if (Op.isReg() && Op.getReg() != SGPRUsed && 1666 usesConstantBus(MRI, Op, getOpSize(*MI, i))) { 1667 return false; 1668 } 1669 } 1670 } 1671 1672 if (MO->isReg()) { 1673 assert(DefinedRC); 1674 const TargetRegisterClass *RC = 1675 TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? 1676 MRI.getRegClass(MO->getReg()) : 1677 RI.getPhysRegClass(MO->getReg()); 1678 1679 // In order to be legal, the common sub-class must be equal to the 1680 // class of the current operand. For example: 1681 // 1682 // v_mov_b32 s0 ; Operand defined as vsrc_32 1683 // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL 1684 // 1685 // s_sendmsg 0, s0 ; Operand defined as m0reg 1686 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 1687 1688 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 1689 } 1690 1691 1692 // Handle non-register types that are treated like immediates. 1693 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 1694 1695 if (!DefinedRC) { 1696 // This operand expects an immediate. 1697 return true; 1698 } 1699 1700 return isImmOperandLegal(MI, OpIdx, *MO); 1701 } 1702 1703 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { 1704 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1705 1706 int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1707 AMDGPU::OpName::src0); 1708 int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1709 AMDGPU::OpName::src1); 1710 int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1711 AMDGPU::OpName::src2); 1712 1713 // Legalize VOP2 1714 if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { 1715 // Legalize src0 1716 if (!isOperandLegal(MI, Src0Idx)) 1717 legalizeOpWithMove(MI, Src0Idx); 1718 1719 // Legalize src1 1720 if (isOperandLegal(MI, Src1Idx)) 1721 return; 1722 1723 // Usually src0 of VOP2 instructions allow more types of inputs 1724 // than src1, so try to commute the instruction to decrease our 1725 // chances of having to insert a MOV instruction to legalize src1. 1726 if (MI->isCommutable()) { 1727 if (commuteInstruction(MI)) 1728 // If we are successful in commuting, then we know MI is legal, so 1729 // we are done. 1730 return; 1731 } 1732 1733 legalizeOpWithMove(MI, Src1Idx); 1734 return; 1735 } 1736 1737 // XXX - Do any VOP3 instructions read VCC? 1738 // Legalize VOP3 1739 if (isVOP3(MI->getOpcode())) { 1740 int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; 1741 1742 // Find the one SGPR operand we are allowed to use. 1743 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 1744 1745 for (unsigned i = 0; i < 3; ++i) { 1746 int Idx = VOP3Idx[i]; 1747 if (Idx == -1) 1748 break; 1749 MachineOperand &MO = MI->getOperand(Idx); 1750 1751 if (MO.isReg()) { 1752 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 1753 continue; // VGPRs are legal 1754 1755 assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); 1756 1757 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { 1758 SGPRReg = MO.getReg(); 1759 // We can use one SGPR in each VOP3 instruction. 1760 continue; 1761 } 1762 } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { 1763 // If it is not a register and not a literal constant, then it must be 1764 // an inline constant which is always legal. 
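        // Illustrative example: in V_MAD_F32 dst, s0, 2.0, v1 the operand 2.0
        // is an inline constant and is legal as-is, whereas a 32-bit literal
        // such as 0x12345678 cannot be encoded by a VOP3 instruction and
        // would be moved into a register by legalizeOpWithMove below.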
1765         continue;
1766       }
1767       // If we make it this far, then the operand is not legal and we must
1768       // legalize it.
1769       legalizeOpWithMove(MI, Idx);
1770     }
1771   }
1772 
1773   // Legalize REG_SEQUENCE and PHI
1774   // The register class of the operands must be the same type as the register
1775   // class of the output.
1776   if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
1777       MI->getOpcode() == AMDGPU::PHI) {
1778     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
1779     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
1780       if (!MI->getOperand(i).isReg() ||
1781           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
1782         continue;
1783       const TargetRegisterClass *OpRC =
1784           MRI.getRegClass(MI->getOperand(i).getReg());
1785       if (RI.hasVGPRs(OpRC)) {
1786         VRC = OpRC;
1787       } else {
1788         SRC = OpRC;
1789       }
1790     }
1791 
1792     // If any of the operands are VGPR registers, then they all must be
1793     // VGPRs; otherwise we will create illegal VGPR->SGPR copies when
1794     // legalizing them.
1795     if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
1796       if (!VRC) {
1797         assert(SRC);
1798         VRC = RI.getEquivalentVGPRClass(SRC);
1799       }
1800       RC = VRC;
1801     } else {
1802       RC = SRC;
1803     }
1804 
1805     // Update all the operands so they have the same type.
1806     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
1807       if (!MI->getOperand(i).isReg() ||
1808           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
1809         continue;
1810       unsigned DstReg = MRI.createVirtualRegister(RC);
1811       MachineBasicBlock *InsertBB;
1812       MachineBasicBlock::iterator Insert;
1813       if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
1814         InsertBB = MI->getParent();
1815         Insert = MI;
1816       } else {
1817         // MI is a PHI instruction.
1818         InsertBB = MI->getOperand(i + 1).getMBB();
1819         Insert = InsertBB->getFirstTerminator();
1820       }
1821       BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
1822               get(AMDGPU::COPY), DstReg)
1823           .addOperand(MI->getOperand(i));
1824       MI->getOperand(i).setReg(DstReg);
1825     }
1826   }
1827 
1828   // Legalize INSERT_SUBREG
1829   // src0 must have the same register class as dst
1830   if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
1831     unsigned Dst = MI->getOperand(0).getReg();
1832     unsigned Src0 = MI->getOperand(1).getReg();
1833     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
1834     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
1835     if (DstRC != Src0RC) {
1836       MachineBasicBlock &MBB = *MI->getParent();
1837       unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
1838       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
1839           .addReg(Src0);
1840       MI->getOperand(1).setReg(NewSrc0);
1841     }
1842     return;
1843   }
1844 
1845   // Legalize MUBUF* instructions
1846   // FIXME: If we start using the non-addr64 instructions for compute, we
1847   // may need to legalize them here.
1848   int SRsrcIdx =
1849       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
1850   if (SRsrcIdx != -1) {
1851     // We have an MUBUF instruction
1852     MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
1853     unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
1854     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
1855                              RI.getRegClass(SRsrcRC))) {
1856       // The operands are legal.
1857       // FIXME: We may need to legalize operands besides srsrc.
1858       return;
1859     }
1860 
1861     MachineBasicBlock &MBB = *MI->getParent();
1862     // Extract the ptr from the resource descriptor.
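    // A sketch of the rewrite that follows (field meanings are only
    // summarized here; the ISA documentation is authoritative):
    //   SRsrcPtrLo/Hi = dwords 0-1 of the old descriptor (the 64-bit base)
    //   NewSRsrc      = { 0, 0, RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
    // The base pointer is pulled out of the descriptor and carried in the
    // 64-bit vaddr of the ADDR64 form instead (added to the existing vaddr
    // if there is one).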
1863 
1864     // SRsrcPtrLo = srsrc:sub0
1865     unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
1866         &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
1867 
1868     // SRsrcPtrHi = srsrc:sub1
1869     unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
1870         &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
1871 
1872     // Create an empty resource descriptor
1873     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1874     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1875     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1876     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
1877     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
1878 
1879     // Zero64 = 0
1880     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
1881             Zero64)
1882         .addImm(0);
1883 
1884     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
1885     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
1886             SRsrcFormatLo)
1887         .addImm(RsrcDataFormat & 0xFFFFFFFF);
1888 
1889     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
1890     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
1891             SRsrcFormatHi)
1892         .addImm(RsrcDataFormat >> 32);
1893 
1894     // NewSRsrc = {Zero64, SRsrcFormat}
1895     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
1896             NewSRsrc)
1897         .addReg(Zero64)
1898         .addImm(AMDGPU::sub0_sub1)
1899         .addReg(SRsrcFormatLo)
1900         .addImm(AMDGPU::sub2)
1901         .addReg(SRsrcFormatHi)
1902         .addImm(AMDGPU::sub3);
1903 
1904     MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
1905     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
1906     unsigned NewVAddrLo;
1907     unsigned NewVAddrHi;
1908     if (VAddr) {
1909       // This is already an ADDR64 instruction so we need to add the pointer
1910       // extracted from the resource descriptor to the current value of VAddr.
1911       NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1912       NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1913 
1914       // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
1915       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
1916               NewVAddrLo)
1917           .addReg(SRsrcPtrLo)
1918           .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
1919           .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
1920 
1921       // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
1922       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
1923               NewVAddrHi)
1924           .addReg(SRsrcPtrHi)
1925           .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
1926           .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
1927           .addReg(AMDGPU::VCC, RegState::Implicit);
1928 
1929     } else {
1930       // This instruction is the _OFFSET variant, so we need to convert it to
1931       // ADDR64.
1932       MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
1933       MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
1934       MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
1935 
1936       // Create the new instruction.
1937       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
1938       MachineInstr *Addr64 =
1939           BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
1940               .addOperand(*VData)
1941               .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
1942                                           // This will be replaced later
1943                                           // with the new value of vaddr.
1944 .addOperand(*SRsrc) 1945 .addOperand(*SOffset) 1946 .addOperand(*Offset) 1947 .addImm(0) // glc 1948 .addImm(0) // slc 1949 .addImm(0); // tfe 1950 1951 MI->removeFromParent(); 1952 MI = Addr64; 1953 1954 NewVAddrLo = SRsrcPtrLo; 1955 NewVAddrHi = SRsrcPtrHi; 1956 VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); 1957 SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); 1958 } 1959 1960 // NewVaddr = {NewVaddrHi, NewVaddrLo} 1961 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 1962 NewVAddr) 1963 .addReg(NewVAddrLo) 1964 .addImm(AMDGPU::sub0) 1965 .addReg(NewVAddrHi) 1966 .addImm(AMDGPU::sub1); 1967 1968 1969 // Update the instruction to use NewVaddr 1970 VAddr->setReg(NewVAddr); 1971 // Update the instruction to use NewSRsrc 1972 SRsrc->setReg(NewSRsrc); 1973 } 1974 } 1975 1976 void SIInstrInfo::splitSMRD(MachineInstr *MI, 1977 const TargetRegisterClass *HalfRC, 1978 unsigned HalfImmOp, unsigned HalfSGPROp, 1979 MachineInstr *&Lo, MachineInstr *&Hi) const { 1980 1981 DebugLoc DL = MI->getDebugLoc(); 1982 MachineBasicBlock *MBB = MI->getParent(); 1983 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1984 unsigned RegLo = MRI.createVirtualRegister(HalfRC); 1985 unsigned RegHi = MRI.createVirtualRegister(HalfRC); 1986 unsigned HalfSize = HalfRC->getSize(); 1987 const MachineOperand *OffOp = 1988 getNamedOperand(*MI, AMDGPU::OpName::offset); 1989 const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); 1990 1991 // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes 1992 // on VI. 1993 1994 bool IsKill = SBase->isKill(); 1995 if (OffOp) { 1996 bool isVI = 1997 MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= 1998 AMDGPUSubtarget::VOLCANIC_ISLANDS; 1999 unsigned OffScale = isVI ? 1 : 4; 2000 // Handle the _IMM variant 2001 unsigned LoOffset = OffOp->getImm() * OffScale; 2002 unsigned HiOffset = LoOffset + HalfSize; 2003 Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) 2004 // Use addReg instead of addOperand 2005 // to make sure kill flag is cleared. 2006 .addReg(SBase->getReg(), 0, SBase->getSubReg()) 2007 .addImm(LoOffset / OffScale); 2008 2009 if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { 2010 unsigned OffsetSGPR = 2011 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 2012 BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) 2013 .addImm(HiOffset); // The offset in register is in bytes. 
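      // Illustrative: when splitting e.g. an S_LOAD_DWORDX8_IMM whose high
      // half offset no longer fits the immediate field, the low half stays an
      // _IMM load while the high half built next uses the _SGPR form and
      // takes its byte offset from OffsetSGPR.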
2014       Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
2015                .addReg(SBase->getReg(), getKillRegState(IsKill),
2016                        SBase->getSubReg())
2017                .addReg(OffsetSGPR);
2018     } else {
2019       Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
2020                .addReg(SBase->getReg(), getKillRegState(IsKill),
2021                        SBase->getSubReg())
2022                .addImm(HiOffset / OffScale);
2023     }
2024   } else {
2025     // Handle the _SGPR variant
2026     MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
2027     Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
2028              .addReg(SBase->getReg(), 0, SBase->getSubReg())
2029              .addOperand(*SOff);
2030     unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2031     BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
2032         .addOperand(*SOff)
2033         .addImm(HalfSize);
2034     Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
2035              .addReg(SBase->getReg(), getKillRegState(IsKill),
2036                      SBase->getSubReg())
2037              .addReg(OffsetSGPR);
2038   }
2039 
2040   unsigned SubLo, SubHi;
2041   switch (HalfSize) {
2042   case 4:
2043     SubLo = AMDGPU::sub0;
2044     SubHi = AMDGPU::sub1;
2045     break;
2046   case 8:
2047     SubLo = AMDGPU::sub0_sub1;
2048     SubHi = AMDGPU::sub2_sub3;
2049     break;
2050   case 16:
2051     SubLo = AMDGPU::sub0_sub1_sub2_sub3;
2052     SubHi = AMDGPU::sub4_sub5_sub6_sub7;
2053     break;
2054   case 32:
2055     SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2056     SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
2057     break;
2058   default:
2059     llvm_unreachable("Unhandled HalfSize");
2060   }
2061 
2062   BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
2063       .addOperand(MI->getOperand(0))
2064       .addReg(RegLo)
2065       .addImm(SubLo)
2066       .addReg(RegHi)
2067       .addImm(SubHi);
2068 }
2069 
2070 void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
2071   MachineBasicBlock *MBB = MI->getParent();
2072   switch (MI->getOpcode()) {
2073   case AMDGPU::S_LOAD_DWORD_IMM:
2074   case AMDGPU::S_LOAD_DWORD_SGPR:
2075   case AMDGPU::S_LOAD_DWORDX2_IMM:
2076   case AMDGPU::S_LOAD_DWORDX2_SGPR:
2077   case AMDGPU::S_LOAD_DWORDX4_IMM:
2078   case AMDGPU::S_LOAD_DWORDX4_SGPR: {
2079     unsigned NewOpcode = getVALUOp(*MI);
2080     unsigned RegOffset;
2081     unsigned ImmOffset;
2082 
2083     if (MI->getOperand(2).isReg()) {
2084       RegOffset = MI->getOperand(2).getReg();
2085       ImmOffset = 0;
2086     } else {
2087       assert(MI->getOperand(2).isImm());
2088       // SMRD instructions take a dword offset on SI and a byte offset on VI,
2089       // and MUBUF instructions always take a byte offset.
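      // Illustrative example: on SI an SMRD immediate offset of 16 addresses
      // byte 64, so the value is shifted left by 2 below before it is reused
      // as a MUBUF offset; on VI the offset is already in bytes.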
2090 ImmOffset = MI->getOperand(2).getImm(); 2091 if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= 2092 AMDGPUSubtarget::SEA_ISLANDS) 2093 ImmOffset <<= 2; 2094 RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2095 2096 if (isUInt<12>(ImmOffset)) { 2097 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), 2098 RegOffset) 2099 .addImm(0); 2100 } else { 2101 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), 2102 RegOffset) 2103 .addImm(ImmOffset); 2104 ImmOffset = 0; 2105 } 2106 } 2107 2108 unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); 2109 unsigned DWord0 = RegOffset; 2110 unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2111 unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2112 unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2113 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); 2114 2115 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) 2116 .addImm(0); 2117 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) 2118 .addImm(RsrcDataFormat & 0xFFFFFFFF); 2119 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) 2120 .addImm(RsrcDataFormat >> 32); 2121 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) 2122 .addReg(DWord0) 2123 .addImm(AMDGPU::sub0) 2124 .addReg(DWord1) 2125 .addImm(AMDGPU::sub1) 2126 .addReg(DWord2) 2127 .addImm(AMDGPU::sub2) 2128 .addReg(DWord3) 2129 .addImm(AMDGPU::sub3); 2130 MI->setDesc(get(NewOpcode)); 2131 if (MI->getOperand(2).isReg()) { 2132 MI->getOperand(2).setReg(SRsrc); 2133 } else { 2134 MI->getOperand(2).ChangeToRegister(SRsrc, false); 2135 } 2136 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); 2137 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); 2138 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc 2139 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc 2140 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe 2141 2142 const TargetRegisterClass *NewDstRC = 2143 RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); 2144 2145 unsigned DstReg = MI->getOperand(0).getReg(); 2146 unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); 2147 MRI.replaceRegWith(DstReg, NewDstReg); 2148 break; 2149 } 2150 case AMDGPU::S_LOAD_DWORDX8_IMM: 2151 case AMDGPU::S_LOAD_DWORDX8_SGPR: { 2152 MachineInstr *Lo, *Hi; 2153 splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, 2154 AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); 2155 MI->eraseFromParent(); 2156 moveSMRDToVALU(Lo, MRI); 2157 moveSMRDToVALU(Hi, MRI); 2158 break; 2159 } 2160 2161 case AMDGPU::S_LOAD_DWORDX16_IMM: 2162 case AMDGPU::S_LOAD_DWORDX16_SGPR: { 2163 MachineInstr *Lo, *Hi; 2164 splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, 2165 AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); 2166 MI->eraseFromParent(); 2167 moveSMRDToVALU(Lo, MRI); 2168 moveSMRDToVALU(Hi, MRI); 2169 break; 2170 } 2171 } 2172 } 2173 2174 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2175 SmallVector<MachineInstr *, 128> Worklist; 2176 Worklist.push_back(&TopInst); 2177 2178 while (!Worklist.empty()) { 2179 MachineInstr *Inst = Worklist.pop_back_val(); 2180 MachineBasicBlock *MBB = Inst->getParent(); 2181 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2182 2183 unsigned Opcode = Inst->getOpcode(); 2184 unsigned NewOpcode = getVALUOp(*Inst); 2185 2186 // Handle some special 
cases 2187 switch (Opcode) { 2188 default: 2189 if (isSMRD(Inst->getOpcode())) { 2190 moveSMRDToVALU(Inst, MRI); 2191 } 2192 break; 2193 case AMDGPU::S_MOV_B64: { 2194 DebugLoc DL = Inst->getDebugLoc(); 2195 2196 // If the source operand is a register we can replace this with a 2197 // copy. 2198 if (Inst->getOperand(1).isReg()) { 2199 MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) 2200 .addOperand(Inst->getOperand(0)) 2201 .addOperand(Inst->getOperand(1)); 2202 Worklist.push_back(Copy); 2203 } else { 2204 // Otherwise, we need to split this into two movs, because there is 2205 // no 64-bit VALU move instruction. 2206 unsigned Reg = Inst->getOperand(0).getReg(); 2207 unsigned Dst = split64BitImm(Worklist, 2208 Inst, 2209 MRI, 2210 MRI.getRegClass(Reg), 2211 Inst->getOperand(1)); 2212 MRI.replaceRegWith(Reg, Dst); 2213 } 2214 Inst->eraseFromParent(); 2215 continue; 2216 } 2217 case AMDGPU::S_AND_B64: 2218 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); 2219 Inst->eraseFromParent(); 2220 continue; 2221 2222 case AMDGPU::S_OR_B64: 2223 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); 2224 Inst->eraseFromParent(); 2225 continue; 2226 2227 case AMDGPU::S_XOR_B64: 2228 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); 2229 Inst->eraseFromParent(); 2230 continue; 2231 2232 case AMDGPU::S_NOT_B64: 2233 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 2234 Inst->eraseFromParent(); 2235 continue; 2236 2237 case AMDGPU::S_BCNT1_I32_B64: 2238 splitScalar64BitBCNT(Worklist, Inst); 2239 Inst->eraseFromParent(); 2240 continue; 2241 2242 case AMDGPU::S_BFE_I64: { 2243 splitScalar64BitBFE(Worklist, Inst); 2244 Inst->eraseFromParent(); 2245 continue; 2246 } 2247 2248 case AMDGPU::S_LSHL_B32: 2249 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2250 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 2251 swapOperands(Inst); 2252 } 2253 break; 2254 case AMDGPU::S_ASHR_I32: 2255 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2256 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 2257 swapOperands(Inst); 2258 } 2259 break; 2260 case AMDGPU::S_LSHR_B32: 2261 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2262 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 2263 swapOperands(Inst); 2264 } 2265 break; 2266 case AMDGPU::S_LSHL_B64: 2267 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2268 NewOpcode = AMDGPU::V_LSHLREV_B64; 2269 swapOperands(Inst); 2270 } 2271 break; 2272 case AMDGPU::S_ASHR_I64: 2273 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2274 NewOpcode = AMDGPU::V_ASHRREV_I64; 2275 swapOperands(Inst); 2276 } 2277 break; 2278 case AMDGPU::S_LSHR_B64: 2279 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 2280 NewOpcode = AMDGPU::V_LSHRREV_B64; 2281 swapOperands(Inst); 2282 } 2283 break; 2284 2285 case AMDGPU::S_BFE_U64: 2286 case AMDGPU::S_BFM_B64: 2287 llvm_unreachable("Moving this op to VALU not implemented"); 2288 } 2289 2290 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 2291 // We cannot move this instruction to the VALU, so we should try to 2292 // legalize its operands instead. 2293 legalizeOperands(Inst); 2294 continue; 2295 } 2296 2297 // Use the new VALU Opcode. 2298 const MCInstrDesc &NewDesc = get(NewOpcode); 2299 Inst->setDesc(NewDesc); 2300 2301 // Remove any references to SCC. Vector instructions can't read from it, and 2302 // We're just about to add the implicit use / defs of VCC, and we don't want 2303 // both. 
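    // For example: S_AND_B32 implicitly defines SCC, but its VALU replacement
    // has no SCC operand, so any such stale operand is dropped by the loop
    // below.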
2304 for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { 2305 MachineOperand &Op = Inst->getOperand(i); 2306 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) 2307 Inst->RemoveOperand(i); 2308 } 2309 2310 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2311 // We are converting these to a BFE, so we need to add the missing 2312 // operands for the size and offset. 2313 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2314 Inst->addOperand(MachineOperand::CreateImm(0)); 2315 Inst->addOperand(MachineOperand::CreateImm(Size)); 2316 2317 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2318 // The VALU version adds the second operand to the result, so insert an 2319 // extra 0 operand. 2320 Inst->addOperand(MachineOperand::CreateImm(0)); 2321 } 2322 2323 addDescImplicitUseDef(NewDesc, Inst); 2324 2325 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2326 const MachineOperand &OffsetWidthOp = Inst->getOperand(2); 2327 // If we need to move this to VGPRs, we need to unpack the second operand 2328 // back into the 2 separate ones for bit offset and width. 2329 assert(OffsetWidthOp.isImm() && 2330 "Scalar BFE is only implemented for constant width and offset"); 2331 uint32_t Imm = OffsetWidthOp.getImm(); 2332 2333 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2334 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2335 Inst->RemoveOperand(2); // Remove old immediate. 2336 Inst->addOperand(MachineOperand::CreateImm(Offset)); 2337 Inst->addOperand(MachineOperand::CreateImm(BitWidth)); 2338 } 2339 2340 // Update the destination register class. 2341 2342 const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); 2343 2344 switch (Opcode) { 2345 // For target instructions, getOpRegClass just returns the virtual 2346 // register class associated with the operand, so we need to find an 2347 // equivalent VGPR register class in order to move the instruction to the 2348 // VALU. 
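    // E.g. a COPY whose result was in SReg_64 is given an equivalent VReg_64
    // destination here so that its users can be moved to the VALU as well.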
2349 case AMDGPU::COPY: 2350 case AMDGPU::PHI: 2351 case AMDGPU::REG_SEQUENCE: 2352 case AMDGPU::INSERT_SUBREG: 2353 if (RI.hasVGPRs(NewDstRC)) 2354 continue; 2355 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 2356 if (!NewDstRC) 2357 continue; 2358 break; 2359 default: 2360 break; 2361 } 2362 2363 unsigned DstReg = Inst->getOperand(0).getReg(); 2364 unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); 2365 MRI.replaceRegWith(DstReg, NewDstReg); 2366 2367 // Legalize the operands 2368 legalizeOperands(Inst); 2369 2370 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), 2371 E = MRI.use_end(); I != E; ++I) { 2372 MachineInstr &UseMI = *I->getParent(); 2373 if (!canReadVGPR(UseMI, I.getOperandNo())) { 2374 Worklist.push_back(&UseMI); 2375 } 2376 } 2377 } 2378 } 2379 2380 //===----------------------------------------------------------------------===// 2381 // Indirect addressing callbacks 2382 //===----------------------------------------------------------------------===// 2383 2384 unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, 2385 unsigned Channel) const { 2386 assert(Channel == 0); 2387 return RegIndex; 2388 } 2389 2390 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { 2391 return &AMDGPU::VGPR_32RegClass; 2392 } 2393 2394 void SIInstrInfo::splitScalar64BitUnaryOp( 2395 SmallVectorImpl<MachineInstr *> &Worklist, 2396 MachineInstr *Inst, 2397 unsigned Opcode) const { 2398 MachineBasicBlock &MBB = *Inst->getParent(); 2399 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2400 2401 MachineOperand &Dest = Inst->getOperand(0); 2402 MachineOperand &Src0 = Inst->getOperand(1); 2403 DebugLoc DL = Inst->getDebugLoc(); 2404 2405 MachineBasicBlock::iterator MII = Inst; 2406 2407 const MCInstrDesc &InstDesc = get(Opcode); 2408 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2409 MRI.getRegClass(Src0.getReg()) : 2410 &AMDGPU::SGPR_32RegClass; 2411 2412 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2413 2414 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2415 AMDGPU::sub0, Src0SubRC); 2416 2417 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2418 const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); 2419 2420 unsigned DestSub0 = MRI.createVirtualRegister(DestRC); 2421 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2422 .addOperand(SrcReg0Sub0); 2423 2424 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2425 AMDGPU::sub1, Src0SubRC); 2426 2427 unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); 2428 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2429 .addOperand(SrcReg0Sub1); 2430 2431 unsigned FullDestReg = MRI.createVirtualRegister(DestRC); 2432 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2433 .addReg(DestSub0) 2434 .addImm(AMDGPU::sub0) 2435 .addReg(DestSub1) 2436 .addImm(AMDGPU::sub1); 2437 2438 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2439 2440 // Try to legalize the operands in case we need to swap the order to keep it 2441 // valid. 
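  // (The halves are built with the 32-bit scalar opcode, e.g. S_NOT_B32, and
  // pushed onto the worklist so a later moveToVALU iteration converts each of
  // them to its VALU form.)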
2442 Worklist.push_back(LoHalf); 2443 Worklist.push_back(HiHalf); 2444 } 2445 2446 void SIInstrInfo::splitScalar64BitBinaryOp( 2447 SmallVectorImpl<MachineInstr *> &Worklist, 2448 MachineInstr *Inst, 2449 unsigned Opcode) const { 2450 MachineBasicBlock &MBB = *Inst->getParent(); 2451 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2452 2453 MachineOperand &Dest = Inst->getOperand(0); 2454 MachineOperand &Src0 = Inst->getOperand(1); 2455 MachineOperand &Src1 = Inst->getOperand(2); 2456 DebugLoc DL = Inst->getDebugLoc(); 2457 2458 MachineBasicBlock::iterator MII = Inst; 2459 2460 const MCInstrDesc &InstDesc = get(Opcode); 2461 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2462 MRI.getRegClass(Src0.getReg()) : 2463 &AMDGPU::SGPR_32RegClass; 2464 2465 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2466 const TargetRegisterClass *Src1RC = Src1.isReg() ? 2467 MRI.getRegClass(Src1.getReg()) : 2468 &AMDGPU::SGPR_32RegClass; 2469 2470 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 2471 2472 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2473 AMDGPU::sub0, Src0SubRC); 2474 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 2475 AMDGPU::sub0, Src1SubRC); 2476 2477 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2478 const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); 2479 2480 unsigned DestSub0 = MRI.createVirtualRegister(DestRC); 2481 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2482 .addOperand(SrcReg0Sub0) 2483 .addOperand(SrcReg1Sub0); 2484 2485 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2486 AMDGPU::sub1, Src0SubRC); 2487 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 2488 AMDGPU::sub1, Src1SubRC); 2489 2490 unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); 2491 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2492 .addOperand(SrcReg0Sub1) 2493 .addOperand(SrcReg1Sub1); 2494 2495 unsigned FullDestReg = MRI.createVirtualRegister(DestRC); 2496 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2497 .addReg(DestSub0) 2498 .addImm(AMDGPU::sub0) 2499 .addReg(DestSub1) 2500 .addImm(AMDGPU::sub1); 2501 2502 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2503 2504 // Try to legalize the operands in case we need to swap the order to keep it 2505 // valid. 2506 Worklist.push_back(LoHalf); 2507 Worklist.push_back(HiHalf); 2508 } 2509 2510 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, 2511 MachineInstr *Inst) const { 2512 MachineBasicBlock &MBB = *Inst->getParent(); 2513 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2514 2515 MachineBasicBlock::iterator MII = Inst; 2516 DebugLoc DL = Inst->getDebugLoc(); 2517 2518 MachineOperand &Dest = Inst->getOperand(0); 2519 MachineOperand &Src = Inst->getOperand(1); 2520 2521 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 2522 const TargetRegisterClass *SrcRC = Src.isReg() ? 
2523                                         MRI.getRegClass(Src.getReg()) :
2524                                         &AMDGPU::SGPR_32RegClass;
2525 
2526   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2527   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2528 
2529   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2530 
2531   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2532                                                       AMDGPU::sub0, SrcSubRC);
2533   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2534                                                       AMDGPU::sub1, SrcSubRC);
2535 
2536   MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
2537                             .addOperand(SrcRegSub0)
2538                             .addImm(0);
2539 
2540   MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2541                              .addOperand(SrcRegSub1)
2542                              .addReg(MidReg);
2543 
2544   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2545 
2546   Worklist.push_back(First);
2547   Worklist.push_back(Second);
2548 }
2549 
2550 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
2551                                       MachineInstr *Inst) const {
2552   MachineBasicBlock &MBB = *Inst->getParent();
2553   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2554   MachineBasicBlock::iterator MII = Inst;
2555   DebugLoc DL = Inst->getDebugLoc();
2556 
2557   MachineOperand &Dest = Inst->getOperand(0);
2558   uint32_t Imm = Inst->getOperand(2).getImm();
2559   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2560   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2561 
2562   (void) Offset;
2563 
2564   // Only sext_inreg cases handled.
2565   assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
2566          BitWidth <= 32 &&
2567          Offset == 0 &&
2568          "Not implemented");
2569 
2570   if (BitWidth < 32) {
2571     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2572     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2573     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2574 
2575     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
2576         .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
2577         .addImm(0)
2578         .addImm(BitWidth);
2579 
2580     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
2581         .addImm(31)
2582         .addReg(MidRegLo);
2583 
2584     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2585         .addReg(MidRegLo)
2586         .addImm(AMDGPU::sub0)
2587         .addReg(MidRegHi)
2588         .addImm(AMDGPU::sub1);
2589 
2590     MRI.replaceRegWith(Dest.getReg(), ResultReg);
2591     return;
2592   }
2593 
2594   MachineOperand &Src = Inst->getOperand(1);
2595   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2596   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2597 
2598   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
2599       .addImm(31)
2600       .addReg(Src.getReg(), 0, AMDGPU::sub0);
2601 
2602   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2603       .addReg(Src.getReg(), 0, AMDGPU::sub0)
2604       .addImm(AMDGPU::sub0)
2605       .addReg(TmpReg)
2606       .addImm(AMDGPU::sub1);
2607 
2608   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2609 }
2610 
2611 void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
2612                                         MachineInstr *Inst) const {
2613   // Add the implicit register uses and definitions.
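  // For example: rewriting S_ADD_I32 to V_ADD_I32_e32 adds an implicit def of
  // VCC for the carry-out, and the VALU encoding adds an implicit use of EXEC.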
2614 if (NewDesc.ImplicitUses) { 2615 for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { 2616 unsigned Reg = NewDesc.ImplicitUses[i]; 2617 Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); 2618 } 2619 } 2620 2621 if (NewDesc.ImplicitDefs) { 2622 for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { 2623 unsigned Reg = NewDesc.ImplicitDefs[i]; 2624 Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); 2625 } 2626 } 2627 } 2628 2629 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, 2630 int OpIndices[3]) const { 2631 const MCInstrDesc &Desc = get(MI->getOpcode()); 2632 2633 // Find the one SGPR operand we are allowed to use. 2634 unsigned SGPRReg = AMDGPU::NoRegister; 2635 2636 // First we need to consider the instruction's operand requirements before 2637 // legalizing. Some operands are required to be SGPRs, such as implicit uses 2638 // of VCC, but we are still bound by the constant bus requirement to only use 2639 // one. 2640 // 2641 // If the operand's class is an SGPR, we can never move it. 2642 2643 for (const MachineOperand &MO : MI->implicit_operands()) { 2644 // We only care about reads. 2645 if (MO.isDef()) 2646 continue; 2647 2648 if (MO.getReg() == AMDGPU::VCC) 2649 return AMDGPU::VCC; 2650 2651 if (MO.getReg() == AMDGPU::FLAT_SCR) 2652 return AMDGPU::FLAT_SCR; 2653 } 2654 2655 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2656 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2657 2658 for (unsigned i = 0; i < 3; ++i) { 2659 int Idx = OpIndices[i]; 2660 if (Idx == -1) 2661 break; 2662 2663 const MachineOperand &MO = MI->getOperand(Idx); 2664 if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) 2665 SGPRReg = MO.getReg(); 2666 2667 if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2668 UsedSGPRs[i] = MO.getReg(); 2669 } 2670 2671 if (SGPRReg != AMDGPU::NoRegister) 2672 return SGPRReg; 2673 2674 // We don't have a required SGPR operand, so we have a bit more freedom in 2675 // selecting operands to move. 2676 2677 // Try to select the most used SGPR. If an SGPR is equal to one of the 2678 // others, we choose that. 2679 // 2680 // e.g. 
2681 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2682 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2683 2684 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2685 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2686 SGPRReg = UsedSGPRs[0]; 2687 } 2688 2689 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2690 if (UsedSGPRs[1] == UsedSGPRs[2]) 2691 SGPRReg = UsedSGPRs[1]; 2692 } 2693 2694 return SGPRReg; 2695 } 2696 2697 MachineInstrBuilder SIInstrInfo::buildIndirectWrite( 2698 MachineBasicBlock *MBB, 2699 MachineBasicBlock::iterator I, 2700 unsigned ValueReg, 2701 unsigned Address, unsigned OffsetReg) const { 2702 const DebugLoc &DL = MBB->findDebugLoc(I); 2703 unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( 2704 getIndirectIndexBegin(*MBB->getParent())); 2705 2706 return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) 2707 .addReg(IndirectBaseReg, RegState::Define) 2708 .addOperand(I->getOperand(0)) 2709 .addReg(IndirectBaseReg) 2710 .addReg(OffsetReg) 2711 .addImm(0) 2712 .addReg(ValueReg); 2713 } 2714 2715 MachineInstrBuilder SIInstrInfo::buildIndirectRead( 2716 MachineBasicBlock *MBB, 2717 MachineBasicBlock::iterator I, 2718 unsigned ValueReg, 2719 unsigned Address, unsigned OffsetReg) const { 2720 const DebugLoc &DL = MBB->findDebugLoc(I); 2721 unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( 2722 getIndirectIndexBegin(*MBB->getParent())); 2723 2724 return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) 2725 .addOperand(I->getOperand(0)) 2726 .addOperand(I->getOperand(1)) 2727 .addReg(IndirectBaseReg) 2728 .addReg(OffsetReg) 2729 .addImm(0); 2730 2731 } 2732 2733 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2734 const MachineFunction &MF) const { 2735 int End = getIndirectIndexEnd(MF); 2736 int Begin = getIndirectIndexBegin(MF); 2737 2738 if (End == -1) 2739 return; 2740 2741 2742 for (int Index = Begin; Index <= End; ++Index) 2743 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2744 2745 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2746 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2747 2748 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2749 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2750 2751 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2752 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2753 2754 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2755 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2756 2757 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2758 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2759 } 2760 2761 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2762 unsigned OperandName) const { 2763 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2764 if (Idx == -1) 2765 return nullptr; 2766 2767 return &MI.getOperand(Idx); 2768 } 2769 2770 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2771 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2772 if (ST.isAmdHsaOS()) { 2773 RsrcDataFormat |= (1ULL << 56); 2774 2775 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2776 // Set MTYPE = 2 2777 RsrcDataFormat |= (2ULL << 59); 2778 } 2779 2780 return RsrcDataFormat; 2781 } 2782