//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//


#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        unsigned &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();
  if (isDS(Opc)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();
    assert(Offset1 > Offset0);

    if (Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(Opc) || isMTBUF(Opc)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(Opc)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  unsigned Opc0 = FirstLdSt->getOpcode();
  unsigned Opc1 = SecondLdSt->getOpcode();

  // TODO: This needs finer tuning
  if (NumLoads > 4)
    return false;

  if (isDS(Opc0) && isDS(Opc1))
    return true;

  if (isSMRD(Opc0) && isSMRD(Opc1))
    return true;
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) &&
      (isMUBUF(Opc1) || isMTBUF(Opc1)))
    return true;

  return false;
}

void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1, 0
  };

  unsigned Opcode;
  const int16_t *SubIndices;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_15;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  while (unsigned SubIdx = *SubIndices++) {
    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
        get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));

    if (*SubIndices)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ?
           AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  int Opcode = -1;

  if (RI.isSGPRClass(RC)) {
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_S32_SAVE; break;
    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
    }
  } else if (RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    MFI->setHasSpilledVGPRs();

    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
    case 64:  Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
    case 96:  Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
    case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
    }
  }

  if (Opcode != -1) {
    FrameInfo->setObjectAlignment(FrameIndex, 4);
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)
      .addFrameIndex(FrameIndex)
      // Place-holder registers; these will be filled in by
      // SIPrepareScratchRegs.
      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
      .addReg(AMDGPU::SGPR0, RegState::Undef);
  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  int Opcode = -1;

  if (RI.isSGPRClass(RC)) {
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
    }
  } else if (RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
    case 64:  Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
    case 96:  Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
    case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
    }
  }

  if (Opcode != -1) {
    FrameInfo->setObjectAlignment(FrameIndex, 4);
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)
      // Place-holder registers; these will be filled in by
      // SIPrepareScratchRegs.
      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
      .addReg(AMDGPU::SGPR0, RegState::Undef);

  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
  }
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (MFI->getShaderType() == ShaderType::COMPUTE &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(&Entry);
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

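  // Note: at this point TIDReg holds the id computed above already scaled by 4
  // (the V_LSHLREV_B32 shift), so the address produced below works out to
  //   LDSSize + FrameOffset * WorkGroupSize + TID * 4,
  // i.e. every work item in the group gets its own 4-byte entry for this
  // frame offset.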
  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
                             int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SI_CONSTDATA_PTR: {
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);

    // Add 32-bit offset from this instruction to the start of the constant data.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
      .addReg(RegLo)
      .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
      .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
      .addReg(RegHi)
      .addImm(0)
      .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
      .addReg(AMDGPU::SCC, RegState::Implicit);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                              bool NewMI) const {

  if (MI->getNumOperands() < 3)
    return nullptr;

  int CommutedOpcode = commuteOpcode(*MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  assert(Src0Idx != -1 && "Should always have src0 operand");

  MachineOperand &Src0 = MI->getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return nullptr;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // Make sure it's legal to commute operands for VOP2.
  if (isVOP2(MI->getOpcode()) &&
      (!isOperandLegal(MI, Src0Idx, &Src1) ||
       !isOperandLegal(MI, Src1Idx, &Src0))) {
    return nullptr;
  }

  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() ||
        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
      return nullptr;
    }

    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods
          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods
        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
  }

  if (MI)
    MI->setDesc(get(CommutedOpcode));

  return MI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                        unsigned &SrcOpIdx1,
                                        unsigned &SrcOpIdx2) const {
  const MCInstrDesc &MCID = MI->getDesc();
  if (!MCID.isCommutable())
    return false;

  unsigned Opc = MI->getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
  // immediate.
  if (!MI->getOperand(Src0Idx).isReg())
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  if (!MI->getOperand(Src1Idx).isReg())
    return false;

  // If any source modifiers are set, the generic instruction commuting won't
  // understand how to copy the source modifiers.
  if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
      hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
    return false;

  SrcOpIdx1 = Src0Idx;
  SrcOpIdx2 = Src1Idx;
  return true;
}

MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
                                         MachineBasicBlock::iterator I,
                                         unsigned DstReg,
                                         unsigned SrcReg) const {
  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
                 DstReg).addReg(SrcReg);
}

bool SIInstrInfo::isMov(unsigned Opcode) const {
  switch (Opcode) {
  default: return false;
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  }
}

bool
SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
  return RC != &AMDGPU::EXECRegRegClass;
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      if (!Src2->isReg() ||
          (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
        return false;

      // We need to do some weird looking operand shuffling since the madmk
      // operands are out of the normal expected order with the multiplied
      // constant as the last operand.
      //
      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
      // src0 -> src2 K
      // src1 -> src0
      // src2 -> src1

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      unsigned Src2Reg = Src2->getReg();
      unsigned Src2SubReg = Src2->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      Src1->setReg(Src2Reg);
      Src1->setSubReg(Src2SubReg);
      Src1->setIsKill(Src2->isKill());

      Src2->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::clamp));

      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

bool
SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
                                         AliasAnalysis *AA) const {
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
    return MI->getOperand(1).isImm();
  }
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, Offset0;
  unsigned BaseReg1, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  unsigned Opc0 = MIa->getOpcode();
  unsigned Opc1 = MIb->getOpcode();

  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
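  // The cases below rely on the different instruction kinds generally
  // targeting different memory: DS operates on the data share, while buffer
  // (MUBUF/MTBUF) and scalar (SMRD) loads access global memory. FLAT may
  // address either, so any pairing that involves a FLAT access is treated
  // conservatively as potentially aliasing.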
  if (isDS(Opc0)) {
    if (isDS(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1);
  }

  if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
    if (isMUBUF(Opc1) || isMTBUF(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1) && !isSMRD(Opc1);
  }

  if (isSMRD(Opc0)) {
    if (isSMRD(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1) && !isMUBUF(Opc1) && !isMTBUF(Opc1);
  }

  if (isFLAT(Opc0)) {
    if (isFLAT(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.
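    // For example, 0x3f800000 (1.0f) is an inline constant for a 32-bit
    // operand, but as a 64-bit operand the same value is neither in the
    // [-16, 64] integer range nor one of the double-precision encodings
    // accepted above.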

    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus
  if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
      (!MO.isImplicit() &&
       (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
        AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
    return true;
  }

  return false;
}

bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }


  // Verify VOP*
  if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = AMDGPU::NoRegister;
    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  return true;
}

unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &MO = MI->getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;


  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
    .addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  assert(SuperReg.isReg());

  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index
  // with the SubIdx passed to this function. The register coalescer should be
  // able to eliminate this extra copy.
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    // XXX - Is there a better way to do this?
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(Op.getImm() >> 32);

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
                                    MachineBasicBlock::iterator MI,
                                    MachineRegisterInfo &MRI,
                                    const TargetRegisterClass *RC,
                                    const MachineOperand &Op) const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned Dst = MRI.createVirtualRegister(RC);

  MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
                             LoDst)
    .addImm(Op.getImm() & 0xFFFFFFFF);
  MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
                             HiDst)
    .addImm(Op.getImm() >> 32);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
    .addReg(LoDst)
    .addImm(AMDGPU::sub0)
    .addReg(HiDst)
    .addImm(AMDGPU::sub1);

  Worklist.push_back(Lo);
  Worklist.push_back(Hi);

  return Dst;
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
  assert(Inst->getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst->getOperand(1);
  Inst->RemoveOperand(1);
  Inst->addOperand(Op1);
}

bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = get(MI->getOpcode());
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  if (isVALU(InstDesc.Opcode) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
    unsigned SGPRUsed =
        MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI->getOperand(i);
      if (Op.isReg() && Op.getReg() != SGPRUsed &&
          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());

    // In order to be legal, the common sub-class must be equal to the
    // class of the current operand. For example:
    //
    // v_mov_b32 s0 ; Operand defined as vsrc_32
    //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
    //
    // s_sendmsg 0, s0 ; Operand defined as m0reg
    //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
  }


  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  // Legalize VOP2
  if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
    // Legalize src0
    if (!isOperandLegal(MI, Src0Idx))
      legalizeOpWithMove(MI, Src0Idx);

    // Legalize src1
    if (isOperandLegal(MI, Src1Idx))
      return;

    // Usually src0 of VOP2 instructions allows more types of inputs
    // than src1, so try to commute the instruction to decrease our
    // chances of having to insert a MOV instruction to legalize src1.
    if (MI->isCommutable()) {
      if (commuteInstruction(MI))
        // If we are successful in commuting, then we know MI is legal, so
        // we are done.
        return;
    }

    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // XXX - Do any VOP3 instructions read VCC?
  // Legalize VOP3
  if (isVOP3(MI->getOpcode())) {
    int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };

    // Find the one SGPR operand we are allowed to use.
    unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

    for (unsigned i = 0; i < 3; ++i) {
      int Idx = VOP3Idx[i];
      if (Idx == -1)
        break;
      MachineOperand &MO = MI->getOperand(Idx);

      if (MO.isReg()) {
        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
          continue; // VGPRs are legal

        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");

        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
          SGPRReg = MO.getReg();
          // We can use one SGPR in each VOP3 instruction.
          continue;
        }
      } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
        // If it is not a register and not a literal constant, then it must be
        // an inline constant which is always legal.
        continue;
      }
      // If we make it this far, then the operand is not legal and we must
      // legalize it.
1720 legalizeOpWithMove(MI, Idx);
1721 }
1722 }
1723
1724 // Legalize REG_SEQUENCE and PHI
1725 // The register class of the operands must match the register
1726 // class of the output.
1727 if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
1728 MI->getOpcode() == AMDGPU::PHI) {
1729 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
1730 for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
1731 if (!MI->getOperand(i).isReg() ||
1732 !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
1733 continue;
1734 const TargetRegisterClass *OpRC =
1735 MRI.getRegClass(MI->getOperand(i).getReg());
1736 if (RI.hasVGPRs(OpRC)) {
1737 VRC = OpRC;
1738 } else {
1739 SRC = OpRC;
1740 }
1741 }
1742
1743 // If any of the operands are VGPRs, then they must all be VGPRs;
1744 // otherwise we will create illegal VGPR->SGPR copies when legalizing
1745 // them.
1746 if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
1747 if (!VRC) {
1748 assert(SRC);
1749 VRC = RI.getEquivalentVGPRClass(SRC);
1750 }
1751 RC = VRC;
1752 } else {
1753 RC = SRC;
1754 }
1755
1756 // Update all the operands so they have the same type.
1757 for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
1758 if (!MI->getOperand(i).isReg() ||
1759 !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
1760 continue;
1761 unsigned DstReg = MRI.createVirtualRegister(RC);
1762 MachineBasicBlock *InsertBB;
1763 MachineBasicBlock::iterator Insert;
1764 if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
1765 InsertBB = MI->getParent();
1766 Insert = MI;
1767 } else {
1768 // MI is a PHI instruction.
1769 InsertBB = MI->getOperand(i + 1).getMBB();
1770 Insert = InsertBB->getFirstTerminator();
1771 }
1772 BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
1773 get(AMDGPU::COPY), DstReg)
1774 .addOperand(MI->getOperand(i));
1775 MI->getOperand(i).setReg(DstReg);
1776 }
1777 }
1778
1779 // Legalize INSERT_SUBREG
1780 // src0 must have the same register class as dst
1781 if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
1782 unsigned Dst = MI->getOperand(0).getReg();
1783 unsigned Src0 = MI->getOperand(1).getReg();
1784 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
1785 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
1786 if (DstRC != Src0RC) {
1787 MachineBasicBlock &MBB = *MI->getParent();
1788 unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
1789 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
1790 .addReg(Src0);
1791 MI->getOperand(1).setReg(NewSrc0);
1792 }
1793 return;
1794 }
1795
1796 // Legalize MUBUF* instructions
1797 // FIXME: If we start using the non-addr64 instructions for compute, we
1798 // may need to legalize them here.
1799 int SRsrcIdx =
1800 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
1801 if (SRsrcIdx != -1) {
1802 // We have an MUBUF instruction
1803 MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
1804 unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
1805 if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
1806 RI.getRegClass(SRsrcRC))) {
1807 // The operands are legal.
1808 // FIXME: We may need to legalize operands besides srsrc.
1809 return;
1810 }
1811
1812 MachineBasicBlock &MBB = *MI->getParent();
1813 // Extract the ptr from the resource descriptor.
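    // Roughly (illustrative summary of the code below):
    //   SRsrcPtr  = srsrc:sub0_sub1          (the 64-bit base pointer, in VGPRs)
    //   NewSRsrc  = { 0, RSRC_DATA_FORMAT }  (descriptor with the pointer zeroed)
    //   NewVAddr  = SRsrcPtr [+ old vaddr]   (pointer folded into vaddr)
    // so the base pointer is added through the VALU while the rest of the
    // descriptor stays in SGPRs.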
1814
1815 // SRsrcPtrLo = srsrc:sub0
1816 unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
1817 &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
1818
1819 // SRsrcPtrHi = srsrc:sub1
1820 unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
1821 &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
1822
1823 // Create an empty resource descriptor
1824 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1825 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1826 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1827 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
1828 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
1829
1830 // Zero64 = 0
1831 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
1832 Zero64)
1833 .addImm(0);
1834
1835 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
1836 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
1837 SRsrcFormatLo)
1838 .addImm(RsrcDataFormat & 0xFFFFFFFF);
1839
1840 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
1841 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
1842 SRsrcFormatHi)
1843 .addImm(RsrcDataFormat >> 32);
1844
1845 // NewSRsrc = {Zero64, SRsrcFormat}
1846 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
1847 NewSRsrc)
1848 .addReg(Zero64)
1849 .addImm(AMDGPU::sub0_sub1)
1850 .addReg(SRsrcFormatLo)
1851 .addImm(AMDGPU::sub2)
1852 .addReg(SRsrcFormatHi)
1853 .addImm(AMDGPU::sub3);
1854
1855 MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
1856 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
1857 unsigned NewVAddrLo;
1858 unsigned NewVAddrHi;
1859 if (VAddr) {
1860 // This is already an ADDR64 instruction so we need to add the pointer
1861 // extracted from the resource descriptor to the current value of VAddr.
1862 NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1863 NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1864
1865 // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
1866 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
1867 NewVAddrLo)
1868 .addReg(SRsrcPtrLo)
1869 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
1870 .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
1871
1872 // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
1873 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
1874 NewVAddrHi)
1875 .addReg(SRsrcPtrHi)
1876 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
1877 .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
1878 .addReg(AMDGPU::VCC, RegState::Implicit);
1879
1880 } else {
1881 // This instruction is the _OFFSET variant, so we need to convert it to
1882 // ADDR64.
1883 MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
1884 MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
1885 MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
1886
1887 // Create the new instruction.
1888 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
1889 MachineInstr *Addr64 =
1890 BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
1891 .addOperand(*VData)
1892 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
1893 // This will be replaced later
1894 // with the new value of vaddr.
1895 .addOperand(*SRsrc) 1896 .addOperand(*SOffset) 1897 .addOperand(*Offset) 1898 .addImm(0) // glc 1899 .addImm(0) // slc 1900 .addImm(0); // tfe 1901 1902 MI->removeFromParent(); 1903 MI = Addr64; 1904 1905 NewVAddrLo = SRsrcPtrLo; 1906 NewVAddrHi = SRsrcPtrHi; 1907 VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); 1908 SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); 1909 } 1910 1911 // NewVaddr = {NewVaddrHi, NewVaddrLo} 1912 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 1913 NewVAddr) 1914 .addReg(NewVAddrLo) 1915 .addImm(AMDGPU::sub0) 1916 .addReg(NewVAddrHi) 1917 .addImm(AMDGPU::sub1); 1918 1919 1920 // Update the instruction to use NewVaddr 1921 VAddr->setReg(NewVAddr); 1922 // Update the instruction to use NewSRsrc 1923 SRsrc->setReg(NewSRsrc); 1924 } 1925 } 1926 1927 void SIInstrInfo::splitSMRD(MachineInstr *MI, 1928 const TargetRegisterClass *HalfRC, 1929 unsigned HalfImmOp, unsigned HalfSGPROp, 1930 MachineInstr *&Lo, MachineInstr *&Hi) const { 1931 1932 DebugLoc DL = MI->getDebugLoc(); 1933 MachineBasicBlock *MBB = MI->getParent(); 1934 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1935 unsigned RegLo = MRI.createVirtualRegister(HalfRC); 1936 unsigned RegHi = MRI.createVirtualRegister(HalfRC); 1937 unsigned HalfSize = HalfRC->getSize(); 1938 const MachineOperand *OffOp = 1939 getNamedOperand(*MI, AMDGPU::OpName::offset); 1940 const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); 1941 1942 // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes 1943 // on VI. 1944 1945 bool IsKill = SBase->isKill(); 1946 if (OffOp) { 1947 bool isVI = 1948 MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= 1949 AMDGPUSubtarget::VOLCANIC_ISLANDS; 1950 unsigned OffScale = isVI ? 1 : 4; 1951 // Handle the _IMM variant 1952 unsigned LoOffset = OffOp->getImm() * OffScale; 1953 unsigned HiOffset = LoOffset + HalfSize; 1954 Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) 1955 // Use addReg instead of addOperand 1956 // to make sure kill flag is cleared. 1957 .addReg(SBase->getReg(), 0, SBase->getSubReg()) 1958 .addImm(LoOffset / OffScale); 1959 1960 if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { 1961 unsigned OffsetSGPR = 1962 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1963 BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) 1964 .addImm(HiOffset); // The offset in register is in bytes. 
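      // The offset of the high half no longer fits the immediate encoding, so
      // it was materialized into OffsetSGPR above and the _SGPR form is used
      // for the second half of the load.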
Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
1966 .addReg(SBase->getReg(), getKillRegState(IsKill),
1967 SBase->getSubReg())
1968 .addReg(OffsetSGPR);
1969 } else {
1970 Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
1971 .addReg(SBase->getReg(), getKillRegState(IsKill),
1972 SBase->getSubReg())
1973 .addImm(HiOffset / OffScale);
1974 }
1975 } else {
1976 // Handle the _SGPR variant
1977 MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
1978 Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
1979 .addReg(SBase->getReg(), 0, SBase->getSubReg())
1980 .addOperand(*SOff);
1981 unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1982 BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
1983 .addOperand(*SOff)
1984 .addImm(HalfSize);
1985 Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) // Define RegHi; it feeds the REG_SEQUENCE below.
1986 .addReg(SBase->getReg(), getKillRegState(IsKill),
1987 SBase->getSubReg())
1988 .addReg(OffsetSGPR);
1989 }
1990
1991 unsigned SubLo, SubHi;
1992 switch (HalfSize) {
1993 case 4:
1994 SubLo = AMDGPU::sub0;
1995 SubHi = AMDGPU::sub1;
1996 break;
1997 case 8:
1998 SubLo = AMDGPU::sub0_sub1;
1999 SubHi = AMDGPU::sub2_sub3;
2000 break;
2001 case 16:
2002 SubLo = AMDGPU::sub0_sub1_sub2_sub3;
2003 SubHi = AMDGPU::sub4_sub5_sub6_sub7;
2004 break;
2005 case 32:
2006 SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2007 SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
2008 break;
2009 default:
2010 llvm_unreachable("Unhandled HalfSize");
2011 }
2012
2013 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
2014 .addOperand(MI->getOperand(0))
2015 .addReg(RegLo)
2016 .addImm(SubLo)
2017 .addReg(RegHi)
2018 .addImm(SubHi);
2019 }
2020
2021 void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
2022 MachineBasicBlock *MBB = MI->getParent();
2023 switch (MI->getOpcode()) {
2024 case AMDGPU::S_LOAD_DWORD_IMM:
2025 case AMDGPU::S_LOAD_DWORD_SGPR:
2026 case AMDGPU::S_LOAD_DWORDX2_IMM:
2027 case AMDGPU::S_LOAD_DWORDX2_SGPR:
2028 case AMDGPU::S_LOAD_DWORDX4_IMM:
2029 case AMDGPU::S_LOAD_DWORDX4_SGPR: {
2030 unsigned NewOpcode = getVALUOp(*MI);
2031 unsigned RegOffset;
2032 unsigned ImmOffset;
2033
2034 if (MI->getOperand(2).isReg()) {
2035 RegOffset = MI->getOperand(2).getReg();
2036 ImmOffset = 0;
2037 } else {
2038 assert(MI->getOperand(2).isImm());
2039 // SMRD instructions take a dword offset on SI and a byte offset on VI,
2040 // while MUBUF instructions always take a byte offset.
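    // For example (illustrative): an SMRD immediate offset of 4 on SI refers
    // to byte 16, so it is shifted left by 2 below; on VI the value is already
    // a byte offset and is used unchanged.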
2041 ImmOffset = MI->getOperand(2).getImm(); 2042 if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= 2043 AMDGPUSubtarget::SEA_ISLANDS) 2044 ImmOffset <<= 2; 2045 RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2046 2047 if (isUInt<12>(ImmOffset)) { 2048 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), 2049 RegOffset) 2050 .addImm(0); 2051 } else { 2052 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), 2053 RegOffset) 2054 .addImm(ImmOffset); 2055 ImmOffset = 0; 2056 } 2057 } 2058 2059 unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); 2060 unsigned DWord0 = RegOffset; 2061 unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2062 unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2063 unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2064 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); 2065 2066 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) 2067 .addImm(0); 2068 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) 2069 .addImm(RsrcDataFormat & 0xFFFFFFFF); 2070 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) 2071 .addImm(RsrcDataFormat >> 32); 2072 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) 2073 .addReg(DWord0) 2074 .addImm(AMDGPU::sub0) 2075 .addReg(DWord1) 2076 .addImm(AMDGPU::sub1) 2077 .addReg(DWord2) 2078 .addImm(AMDGPU::sub2) 2079 .addReg(DWord3) 2080 .addImm(AMDGPU::sub3); 2081 MI->setDesc(get(NewOpcode)); 2082 if (MI->getOperand(2).isReg()) { 2083 MI->getOperand(2).setReg(SRsrc); 2084 } else { 2085 MI->getOperand(2).ChangeToRegister(SRsrc, false); 2086 } 2087 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); 2088 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); 2089 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc 2090 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc 2091 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe 2092 2093 const TargetRegisterClass *NewDstRC = 2094 RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); 2095 2096 unsigned DstReg = MI->getOperand(0).getReg(); 2097 unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); 2098 MRI.replaceRegWith(DstReg, NewDstReg); 2099 break; 2100 } 2101 case AMDGPU::S_LOAD_DWORDX8_IMM: 2102 case AMDGPU::S_LOAD_DWORDX8_SGPR: { 2103 MachineInstr *Lo, *Hi; 2104 splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, 2105 AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); 2106 MI->eraseFromParent(); 2107 moveSMRDToVALU(Lo, MRI); 2108 moveSMRDToVALU(Hi, MRI); 2109 break; 2110 } 2111 2112 case AMDGPU::S_LOAD_DWORDX16_IMM: 2113 case AMDGPU::S_LOAD_DWORDX16_SGPR: { 2114 MachineInstr *Lo, *Hi; 2115 splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, 2116 AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); 2117 MI->eraseFromParent(); 2118 moveSMRDToVALU(Lo, MRI); 2119 moveSMRDToVALU(Hi, MRI); 2120 break; 2121 } 2122 } 2123 } 2124 2125 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2126 SmallVector<MachineInstr *, 128> Worklist; 2127 Worklist.push_back(&TopInst); 2128 2129 while (!Worklist.empty()) { 2130 MachineInstr *Inst = Worklist.pop_back_val(); 2131 MachineBasicBlock *MBB = Inst->getParent(); 2132 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2133 2134 unsigned Opcode = Inst->getOpcode(); 2135 unsigned NewOpcode = getVALUOp(*Inst); 2136 2137 // Handle some special 
cases
2138 switch (Opcode) {
2139 default:
2140 if (isSMRD(Inst->getOpcode())) {
2141 moveSMRDToVALU(Inst, MRI);
2142 }
2143 break;
2144 case AMDGPU::S_MOV_B64: {
2145 DebugLoc DL = Inst->getDebugLoc();
2146
2147 // If the source operand is a register, we can replace this with a
2148 // copy.
2149 if (Inst->getOperand(1).isReg()) {
2150 MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
2151 .addOperand(Inst->getOperand(0))
2152 .addOperand(Inst->getOperand(1));
2153 Worklist.push_back(Copy);
2154 } else {
2155 // Otherwise, we need to split this into two movs, because there is
2156 // no 64-bit VALU move instruction.
2157 unsigned Reg = Inst->getOperand(0).getReg();
2158 unsigned Dst = split64BitImm(Worklist,
2159 Inst,
2160 MRI,
2161 MRI.getRegClass(Reg),
2162 Inst->getOperand(1));
2163 MRI.replaceRegWith(Reg, Dst);
2164 }
2165 Inst->eraseFromParent();
2166 continue;
2167 }
2168 case AMDGPU::S_AND_B64:
2169 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
2170 Inst->eraseFromParent();
2171 continue;
2172
2173 case AMDGPU::S_OR_B64:
2174 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
2175 Inst->eraseFromParent();
2176 continue;
2177
2178 case AMDGPU::S_XOR_B64:
2179 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
2180 Inst->eraseFromParent();
2181 continue;
2182
2183 case AMDGPU::S_NOT_B64:
2184 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
2185 Inst->eraseFromParent();
2186 continue;
2187
2188 case AMDGPU::S_BCNT1_I32_B64:
2189 splitScalar64BitBCNT(Worklist, Inst);
2190 Inst->eraseFromParent();
2191 continue;
2192
2193 case AMDGPU::S_BFE_I64: {
2194 splitScalar64BitBFE(Worklist, Inst);
2195 Inst->eraseFromParent();
2196 continue;
2197 }
2198
2199 case AMDGPU::S_LSHL_B32:
2200 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2201 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
2202 swapOperands(Inst);
2203 }
2204 break;
2205 case AMDGPU::S_ASHR_I32:
2206 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2207 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
2208 swapOperands(Inst);
2209 }
2210 break;
2211 case AMDGPU::S_LSHR_B32:
2212 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2213 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
2214 swapOperands(Inst);
2215 }
2216 break;
2217 case AMDGPU::S_LSHL_B64:
2218 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2219 NewOpcode = AMDGPU::V_LSHLREV_B64;
2220 swapOperands(Inst);
2221 }
2222 break;
2223 case AMDGPU::S_ASHR_I64:
2224 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2225 NewOpcode = AMDGPU::V_ASHRREV_I64;
2226 swapOperands(Inst);
2227 }
2228 break;
2229 case AMDGPU::S_LSHR_B64:
2230 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2231 NewOpcode = AMDGPU::V_LSHRREV_B64;
2232 swapOperands(Inst);
2233 }
2234 break;
2235
2236 case AMDGPU::S_BFE_U64:
2237 case AMDGPU::S_BFM_B64:
2238 llvm_unreachable("Moving this op to VALU not implemented");
2239 }
2240
2241 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
2242 // We cannot move this instruction to the VALU, so we should try to
2243 // legalize its operands instead.
2244 legalizeOperands(Inst);
2245 continue;
2246 }
2247
2248 // Use the new VALU Opcode.
2249 const MCInstrDesc &NewDesc = get(NewOpcode);
2250 Inst->setDesc(NewDesc);
2251
2252 // Remove any references to SCC. Vector instructions can't read from it, and
2253 // we're just about to add the implicit use / defs of VCC, and we don't want
2254 // both.
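    // For example, S_AND_B32 implicitly defines SCC, but its VALU replacement
    // does not model SCC at all, so any leftover SCC operand would be stale.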
2255 for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { 2256 MachineOperand &Op = Inst->getOperand(i); 2257 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) 2258 Inst->RemoveOperand(i); 2259 } 2260 2261 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2262 // We are converting these to a BFE, so we need to add the missing 2263 // operands for the size and offset. 2264 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2265 Inst->addOperand(MachineOperand::CreateImm(0)); 2266 Inst->addOperand(MachineOperand::CreateImm(Size)); 2267 2268 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2269 // The VALU version adds the second operand to the result, so insert an 2270 // extra 0 operand. 2271 Inst->addOperand(MachineOperand::CreateImm(0)); 2272 } 2273 2274 addDescImplicitUseDef(NewDesc, Inst); 2275 2276 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2277 const MachineOperand &OffsetWidthOp = Inst->getOperand(2); 2278 // If we need to move this to VGPRs, we need to unpack the second operand 2279 // back into the 2 separate ones for bit offset and width. 2280 assert(OffsetWidthOp.isImm() && 2281 "Scalar BFE is only implemented for constant width and offset"); 2282 uint32_t Imm = OffsetWidthOp.getImm(); 2283 2284 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2285 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2286 Inst->RemoveOperand(2); // Remove old immediate. 2287 Inst->addOperand(MachineOperand::CreateImm(Offset)); 2288 Inst->addOperand(MachineOperand::CreateImm(BitWidth)); 2289 } 2290 2291 // Update the destination register class. 2292 2293 const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); 2294 2295 switch (Opcode) { 2296 // For target instructions, getOpRegClass just returns the virtual 2297 // register class associated with the operand, so we need to find an 2298 // equivalent VGPR register class in order to move the instruction to the 2299 // VALU. 
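    // For example (illustrative), a COPY whose result was created in SReg_64
    // must be retyped to VReg_64 once its input has been moved to a VGPR;
    // getEquivalentVGPRClass() provides that mapping.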
2300 case AMDGPU::COPY: 2301 case AMDGPU::PHI: 2302 case AMDGPU::REG_SEQUENCE: 2303 case AMDGPU::INSERT_SUBREG: 2304 if (RI.hasVGPRs(NewDstRC)) 2305 continue; 2306 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 2307 if (!NewDstRC) 2308 continue; 2309 break; 2310 default: 2311 break; 2312 } 2313 2314 unsigned DstReg = Inst->getOperand(0).getReg(); 2315 unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); 2316 MRI.replaceRegWith(DstReg, NewDstReg); 2317 2318 // Legalize the operands 2319 legalizeOperands(Inst); 2320 2321 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), 2322 E = MRI.use_end(); I != E; ++I) { 2323 MachineInstr &UseMI = *I->getParent(); 2324 if (!canReadVGPR(UseMI, I.getOperandNo())) { 2325 Worklist.push_back(&UseMI); 2326 } 2327 } 2328 } 2329 } 2330 2331 //===----------------------------------------------------------------------===// 2332 // Indirect addressing callbacks 2333 //===----------------------------------------------------------------------===// 2334 2335 unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, 2336 unsigned Channel) const { 2337 assert(Channel == 0); 2338 return RegIndex; 2339 } 2340 2341 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { 2342 return &AMDGPU::VGPR_32RegClass; 2343 } 2344 2345 void SIInstrInfo::splitScalar64BitUnaryOp( 2346 SmallVectorImpl<MachineInstr *> &Worklist, 2347 MachineInstr *Inst, 2348 unsigned Opcode) const { 2349 MachineBasicBlock &MBB = *Inst->getParent(); 2350 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2351 2352 MachineOperand &Dest = Inst->getOperand(0); 2353 MachineOperand &Src0 = Inst->getOperand(1); 2354 DebugLoc DL = Inst->getDebugLoc(); 2355 2356 MachineBasicBlock::iterator MII = Inst; 2357 2358 const MCInstrDesc &InstDesc = get(Opcode); 2359 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2360 MRI.getRegClass(Src0.getReg()) : 2361 &AMDGPU::SGPR_32RegClass; 2362 2363 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2364 2365 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2366 AMDGPU::sub0, Src0SubRC); 2367 2368 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2369 const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); 2370 2371 unsigned DestSub0 = MRI.createVirtualRegister(DestRC); 2372 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2373 .addOperand(SrcReg0Sub0); 2374 2375 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2376 AMDGPU::sub1, Src0SubRC); 2377 2378 unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); 2379 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2380 .addOperand(SrcReg0Sub1); 2381 2382 unsigned FullDestReg = MRI.createVirtualRegister(DestRC); 2383 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2384 .addReg(DestSub0) 2385 .addImm(AMDGPU::sub0) 2386 .addReg(DestSub1) 2387 .addImm(AMDGPU::sub1); 2388 2389 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2390 2391 // Try to legalize the operands in case we need to swap the order to keep it 2392 // valid. 
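  // For example (illustrative), S_NOT_B64 is split here into two 32-bit scalar
  // halves plus a REG_SEQUENCE; the halves are pushed on the worklist so that
  // moveToVALU() converts and legalizes them as well.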
2393 Worklist.push_back(LoHalf); 2394 Worklist.push_back(HiHalf); 2395 } 2396 2397 void SIInstrInfo::splitScalar64BitBinaryOp( 2398 SmallVectorImpl<MachineInstr *> &Worklist, 2399 MachineInstr *Inst, 2400 unsigned Opcode) const { 2401 MachineBasicBlock &MBB = *Inst->getParent(); 2402 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2403 2404 MachineOperand &Dest = Inst->getOperand(0); 2405 MachineOperand &Src0 = Inst->getOperand(1); 2406 MachineOperand &Src1 = Inst->getOperand(2); 2407 DebugLoc DL = Inst->getDebugLoc(); 2408 2409 MachineBasicBlock::iterator MII = Inst; 2410 2411 const MCInstrDesc &InstDesc = get(Opcode); 2412 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2413 MRI.getRegClass(Src0.getReg()) : 2414 &AMDGPU::SGPR_32RegClass; 2415 2416 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2417 const TargetRegisterClass *Src1RC = Src1.isReg() ? 2418 MRI.getRegClass(Src1.getReg()) : 2419 &AMDGPU::SGPR_32RegClass; 2420 2421 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 2422 2423 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2424 AMDGPU::sub0, Src0SubRC); 2425 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 2426 AMDGPU::sub0, Src1SubRC); 2427 2428 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2429 const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); 2430 2431 unsigned DestSub0 = MRI.createVirtualRegister(DestRC); 2432 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2433 .addOperand(SrcReg0Sub0) 2434 .addOperand(SrcReg1Sub0); 2435 2436 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2437 AMDGPU::sub1, Src0SubRC); 2438 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 2439 AMDGPU::sub1, Src1SubRC); 2440 2441 unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); 2442 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2443 .addOperand(SrcReg0Sub1) 2444 .addOperand(SrcReg1Sub1); 2445 2446 unsigned FullDestReg = MRI.createVirtualRegister(DestRC); 2447 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2448 .addReg(DestSub0) 2449 .addImm(AMDGPU::sub0) 2450 .addReg(DestSub1) 2451 .addImm(AMDGPU::sub1); 2452 2453 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2454 2455 // Try to legalize the operands in case we need to swap the order to keep it 2456 // valid. 2457 Worklist.push_back(LoHalf); 2458 Worklist.push_back(HiHalf); 2459 } 2460 2461 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, 2462 MachineInstr *Inst) const { 2463 MachineBasicBlock &MBB = *Inst->getParent(); 2464 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2465 2466 MachineBasicBlock::iterator MII = Inst; 2467 DebugLoc DL = Inst->getDebugLoc(); 2468 2469 MachineOperand &Dest = Inst->getOperand(0); 2470 MachineOperand &Src = Inst->getOperand(1); 2471 2472 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 2473 const TargetRegisterClass *SrcRC = Src.isReg() ? 
MRI.getRegClass(Src.getReg()) :
2475 &AMDGPU::SGPR_32RegClass;
2476
2477 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2478 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2479
2480 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2481
2482 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2483 AMDGPU::sub0, SrcSubRC);
2484 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2485 AMDGPU::sub1, SrcSubRC);
2486
2487 MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
2488 .addOperand(SrcRegSub0)
2489 .addImm(0);
2490
2491 MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2492 .addOperand(SrcRegSub1)
2493 .addReg(MidReg);
2494
2495 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2496
2497 Worklist.push_back(First);
2498 Worklist.push_back(Second);
2499 }
2500
2501 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
2502 MachineInstr *Inst) const {
2503 MachineBasicBlock &MBB = *Inst->getParent();
2504 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2505 MachineBasicBlock::iterator MII = Inst;
2506 DebugLoc DL = Inst->getDebugLoc();
2507
2508 MachineOperand &Dest = Inst->getOperand(0);
2509 uint32_t Imm = Inst->getOperand(2).getImm();
2510 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2511 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2512
2513 (void) Offset;
2514
2515 // Only sext_inreg cases handled.
2516 assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
2517 BitWidth <= 32 &&
2518 Offset == 0 &&
2519 "Not implemented");
2520
2521 if (BitWidth < 32) {
2522 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2523 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2524 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2525
2526 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
2527 .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
2528 .addImm(0)
2529 .addImm(BitWidth);
2530
2531 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
2532 .addImm(31)
2533 .addReg(MidRegLo);
2534
2535 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2536 .addReg(MidRegLo)
2537 .addImm(AMDGPU::sub0)
2538 .addReg(MidRegHi)
2539 .addImm(AMDGPU::sub1);
2540
2541 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2542 return;
2543 }
2544
2545 MachineOperand &Src = Inst->getOperand(1);
2546 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2547 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2548
2549 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
2550 .addImm(31)
2551 .addReg(Src.getReg(), 0, AMDGPU::sub0);
2552
2553 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2554 .addReg(Src.getReg(), 0, AMDGPU::sub0)
2555 .addImm(AMDGPU::sub0)
2556 .addReg(TmpReg)
2557 .addImm(AMDGPU::sub1);
2558
2559 MRI.replaceRegWith(Dest.getReg(), ResultReg);
2560 }
2561
2562 void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
2563 MachineInstr *Inst) const {
2564 // Add the implicit register uses and definitions from the new opcode's description.
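  // For example, V_ADD_I32_e32 implicitly defines VCC (the carry out), so a
  // scalar add moved to that opcode needs the VCC def added here.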
2565 if (NewDesc.ImplicitUses) { 2566 for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { 2567 unsigned Reg = NewDesc.ImplicitUses[i]; 2568 Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); 2569 } 2570 } 2571 2572 if (NewDesc.ImplicitDefs) { 2573 for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { 2574 unsigned Reg = NewDesc.ImplicitDefs[i]; 2575 Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); 2576 } 2577 } 2578 } 2579 2580 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, 2581 int OpIndices[3]) const { 2582 const MCInstrDesc &Desc = get(MI->getOpcode()); 2583 2584 // Find the one SGPR operand we are allowed to use. 2585 unsigned SGPRReg = AMDGPU::NoRegister; 2586 2587 // First we need to consider the instruction's operand requirements before 2588 // legalizing. Some operands are required to be SGPRs, such as implicit uses 2589 // of VCC, but we are still bound by the constant bus requirement to only use 2590 // one. 2591 // 2592 // If the operand's class is an SGPR, we can never move it. 2593 2594 for (const MachineOperand &MO : MI->implicit_operands()) { 2595 // We only care about reads. 2596 if (MO.isDef()) 2597 continue; 2598 2599 if (MO.getReg() == AMDGPU::VCC) 2600 return AMDGPU::VCC; 2601 2602 if (MO.getReg() == AMDGPU::FLAT_SCR) 2603 return AMDGPU::FLAT_SCR; 2604 } 2605 2606 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2607 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2608 2609 for (unsigned i = 0; i < 3; ++i) { 2610 int Idx = OpIndices[i]; 2611 if (Idx == -1) 2612 break; 2613 2614 const MachineOperand &MO = MI->getOperand(Idx); 2615 if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) 2616 SGPRReg = MO.getReg(); 2617 2618 if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2619 UsedSGPRs[i] = MO.getReg(); 2620 } 2621 2622 if (SGPRReg != AMDGPU::NoRegister) 2623 return SGPRReg; 2624 2625 // We don't have a required SGPR operand, so we have a bit more freedom in 2626 // selecting operands to move. 2627 2628 // Try to select the most used SGPR. If an SGPR is equal to one of the 2629 // others, we choose that. 2630 // 2631 // e.g. 
2632 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2633 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2634 2635 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2636 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2637 SGPRReg = UsedSGPRs[0]; 2638 } 2639 2640 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2641 if (UsedSGPRs[1] == UsedSGPRs[2]) 2642 SGPRReg = UsedSGPRs[1]; 2643 } 2644 2645 return SGPRReg; 2646 } 2647 2648 MachineInstrBuilder SIInstrInfo::buildIndirectWrite( 2649 MachineBasicBlock *MBB, 2650 MachineBasicBlock::iterator I, 2651 unsigned ValueReg, 2652 unsigned Address, unsigned OffsetReg) const { 2653 const DebugLoc &DL = MBB->findDebugLoc(I); 2654 unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( 2655 getIndirectIndexBegin(*MBB->getParent())); 2656 2657 return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) 2658 .addReg(IndirectBaseReg, RegState::Define) 2659 .addOperand(I->getOperand(0)) 2660 .addReg(IndirectBaseReg) 2661 .addReg(OffsetReg) 2662 .addImm(0) 2663 .addReg(ValueReg); 2664 } 2665 2666 MachineInstrBuilder SIInstrInfo::buildIndirectRead( 2667 MachineBasicBlock *MBB, 2668 MachineBasicBlock::iterator I, 2669 unsigned ValueReg, 2670 unsigned Address, unsigned OffsetReg) const { 2671 const DebugLoc &DL = MBB->findDebugLoc(I); 2672 unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( 2673 getIndirectIndexBegin(*MBB->getParent())); 2674 2675 return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) 2676 .addOperand(I->getOperand(0)) 2677 .addOperand(I->getOperand(1)) 2678 .addReg(IndirectBaseReg) 2679 .addReg(OffsetReg) 2680 .addImm(0); 2681 2682 } 2683 2684 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2685 const MachineFunction &MF) const { 2686 int End = getIndirectIndexEnd(MF); 2687 int Begin = getIndirectIndexBegin(MF); 2688 2689 if (End == -1) 2690 return; 2691 2692 2693 for (int Index = Begin; Index <= End; ++Index) 2694 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2695 2696 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2697 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2698 2699 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2700 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2701 2702 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2703 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2704 2705 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2706 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2707 2708 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2709 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2710 } 2711 2712 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2713 unsigned OperandName) const { 2714 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2715 if (Idx == -1) 2716 return nullptr; 2717 2718 return &MI.getOperand(Idx); 2719 } 2720 2721 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2722 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2723 if (ST.isAmdHsaOS()) { 2724 RsrcDataFormat |= (1ULL << 56); 2725 2726 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2727 // Set MTYPE = 2 2728 RsrcDataFormat |= (2ULL << 59); 2729 } 2730 2731 return RsrcDataFormat; 2732 } 2733