//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the exec check.
  switch (MI->getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  default:
    return false;
  }
}

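/// \brief Check whether \p Load0 and \p Load1 read from the same base
/// pointer, and if so report their immediate offsets in \p Offset0 and
/// \p Offset1. Handles DS, SMRD, and MUBUF/MTBUF loads; anything else, or
/// loads whose offsets are not constants, conservatively returns false.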
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        unsigned &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();
  if (isDS(Opc)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive.
    // We will use this for some partially aligned loads.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();
    assert(Offset1 > Offset0);

    if (Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(Opc) || isMTBUF(Opc)) {
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(Opc)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  return false;
}

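/// \brief Load-clustering heuristic: returns true if it is profitable to
/// cluster \p FirstLdSt and \p SecondLdSt, i.e. both are of the same kind
/// (DS, SMRD, or MUBUF/MTBUF) and no more than four loads are in the run.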
bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
                                     MachineInstr *SecondLdSt,
                                     unsigned NumLoads) const {
  unsigned Opc0 = FirstLdSt->getOpcode();
  unsigned Opc1 = SecondLdSt->getOpcode();

  // TODO: This needs finer tuning.
  if (NumLoads > 4)
    return false;

  if (isDS(Opc0) && isDS(Opc1))
    return true;

  if (isSMRD(Opc0) && isSMRD(Opc1))
    return true;

  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
    return true;

  return false;
}

void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MI, DebugLoc DL,
                         unsigned DestReg, unsigned SrcReg,
                         bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend. While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1, 0
  };

  unsigned Opcode;
  const int16_t *SubIndices;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
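        // The source here is a VGPR holding a per-lane boolean (the VReg_1
        // hack), so materialize it into VCC by comparing it against zero.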
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B32;
    SubIndices = Sub0_15;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  while (unsigned SubIdx = *SubIndices++) {
    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));

    if (*SubIndices)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
  const unsigned Opcode = MI.getOpcode();

  int NewOpc;

  // Try to map original to commuted opcode.
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode.
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

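/// \brief Pick the move opcode that matches the destination register class:
/// scalar moves for SGPR classes, vector moves for VGPR classes, and a plain
/// COPY for anything not explicitly handled here.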
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (DstRC->getSize() == 4) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  int Opcode = -1;

  if (RI.isSGPRClass(RC)) {
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_S32_SAVE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
    }
  } else if (RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    MFI->setHasSpilledVGPRs();

    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_V32_SAVE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_V64_SAVE;  break;
    case 96:  Opcode = AMDGPU::SI_SPILL_V96_SAVE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
    }
  }

  if (Opcode != -1) {
    FrameInfo->setObjectAlignment(FrameIndex, 4);
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)
      .addFrameIndex(FrameIndex)
      // Place-holder registers; these will be filled in by
      // SIPrepareScratchRegs.
      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
      .addReg(AMDGPU::SGPR0, RegState::Undef);
  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  int Opcode = -1;

  if (RI.isSGPRClass(RC)) {
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
    }
  } else if (RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    switch (RC->getSize() * 8) {
    case 32:  Opcode = AMDGPU::SI_SPILL_V32_RESTORE;  break;
    case 64:  Opcode = AMDGPU::SI_SPILL_V64_RESTORE;  break;
    case 96:  Opcode = AMDGPU::SI_SPILL_V96_RESTORE;  break;
    case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
    case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
    }
  }

  if (Opcode != -1) {
    FrameInfo->setObjectAlignment(FrameIndex, 4);
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)
      // Place-holder registers; these will be filled in by
      // SIPrepareScratchRegs.
      .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
      .addReg(AMDGPU::SGPR0, RegState::Undef);

  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
  }
}

/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (MFI->getShaderType() == ShaderType::COMPUTE &&
        WorkGroupSize > WavefrontSize) {

      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(&Entry);
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
              .addReg(STmp1)
              .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
              .addReg(STmp1)
              .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
              .addReg(STmp0)
              .addReg(TIDIGYReg)
              .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
              .addReg(TIDReg)
              .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
            .addImm(2)
            .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

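  // TIDReg now holds the workitem's linear id shifted left by 2 (one dword
  // per lane). The spill address computed below is:
  //   LDSSize + FrameOffset * WorkGroupSize + TIDReg
  // so every workitem in the group gets its own slot for this frame index.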
  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
          .addImm(LDSOffset)
          .addReg(TIDReg);

  return TmpReg;
}

void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
                             int Count) const {
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
            .addImm(Arg);
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SI_CONSTDATA_PTR: {
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);

    // Add 32-bit offset from this instruction to the start of the constant
    // data.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
            .addReg(RegLo)
            .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
            .addReg(RegHi)
            .addImm(0)
            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
            .addReg(AMDGPU::SCC, RegState::Implicit);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
              .addImm(Imm.getLoBits(32).getZExtValue())
              .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
              .addImm(Imm.getHiBits(32).getZExtValue())
              .addReg(Dst, RegState::Implicit);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
              .addReg(Dst, RegState::Implicit);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
              .addReg(Dst, RegState::Implicit);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
            .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
            .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
            .addOperand(SrcCond);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
            .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
            .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
            .addOperand(SrcCond);
    MI->eraseFromParent();
    break;
  }
  }
  return true;
}

MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                              bool NewMI) const {

  if (MI->getNumOperands() < 3)
    return nullptr;

  int CommutedOpcode = commuteOpcode(*MI);
  if (CommutedOpcode == -1)
    return nullptr;

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  assert(Src0Idx != -1 && "Should always have src0 operand");

  MachineOperand &Src0 = MI->getOperand(Src0Idx);
  if (!Src0.isReg())
    return nullptr;

  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return nullptr;

  MachineOperand &Src1 = MI->getOperand(Src1Idx);

  // Make sure it's legal to commute operands for VOP2.
  if (isVOP2(MI->getOpcode()) &&
      (!isOperandLegal(MI, Src0Idx, &Src1) ||
       !isOperandLegal(MI, Src1Idx, &Src0))) {
    return nullptr;
  }

  if (!Src1.isReg()) {
    // Allow commuting instructions with Imm operands.
    if (NewMI || !Src1.isImm() ||
        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
      return nullptr;
    }

    // Be sure to copy the source modifiers to the right place.
    if (MachineOperand *Src0Mods
          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      MachineOperand *Src1Mods
        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);

      int Src0ModsVal = Src0Mods->getImm();
      if (!Src1Mods && Src0ModsVal != 0)
        return nullptr;

      // XXX - This assert might be a lie. It might be useful to have a neg
      // modifier with 0.0.
      int Src1ModsVal = Src1Mods->getImm();
      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");

      Src1Mods->setImm(Src0ModsVal);
      Src0Mods->setImm(Src1ModsVal);
    }

    unsigned Reg = Src0.getReg();
    unsigned SubReg = Src0.getSubReg();
    if (Src1.isImm())
      Src0.ChangeToImmediate(Src1.getImm());
    else
      llvm_unreachable("Should only have immediates");

    Src1.ChangeToRegister(Reg, false);
    Src1.setSubReg(SubReg);
  } else {
    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
  }

  if (MI)
    MI->setDesc(get(CommutedOpcode));

  return MI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                        unsigned &SrcOpIdx1,
                                        unsigned &SrcOpIdx2) const {
  const MCInstrDesc &MCID = MI->getDesc();
  if (!MCID.isCommutable())
    return false;

  unsigned Opc = MI->getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
  // immediate.
  if (!MI->getOperand(Src0Idx).isReg())
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  if (!MI->getOperand(Src1Idx).isReg())
    return false;

  // If any source modifiers are set, the generic instruction commuting won't
  // understand how to copy the source modifiers.
  if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
      hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
    return false;

  SrcOpIdx1 = Src0Idx;
  SrcOpIdx2 = Src1Idx;
  return true;
}

MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
                                         MachineBasicBlock::iterator I,
                                         unsigned DstReg,
                                         unsigned SrcReg) const {
  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
                 DstReg).addReg(SrcReg);
}

bool SIInstrInfo::isMov(unsigned Opcode) const {
  switch (Opcode) {
  default: return false;
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
    return true;
  }
}

bool
SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
  return RC != &AMDGPU::EXECRegRegClass;
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      if (!Src2->isReg() ||
          (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
        return false;

      // We need to do some weird looking operand shuffling since the madmk
      // operands are out of the normal expected order with the multiplied
      // constant as the last operand.
      //
      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
      // src0 -> src2 K
      // src1 -> src0
      // src2 -> src1

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      unsigned Src2Reg = Src2->getReg();
      unsigned Src2SubReg = Src2->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      Src1->setReg(Src2Reg);
      Src1->setSubReg(Src2SubReg);
      Src1->setIsKill(Src2->isKill());

      Src2->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() ||
          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                      AMDGPU::OpName::clamp));

      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}

bool
SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
                                         AliasAnalysis *AA) const {
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B32_e32:
    return MI->getOperand(1).isImm();
  }
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
                                               MachineInstr *MIb) const {
  unsigned BaseReg0, Offset0;
  unsigned BaseReg1, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
           "read2 / write2 not expected here yet");
    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
                                                  MachineInstr *MIb,
                                                  AliasAnalysis *AA) const {
  unsigned Opc0 = MIa->getOpcode();
  unsigned Opc1 = MIb->getOpcode();

  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
         "MIa must load from or modify a memory location");
  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
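  // The checks below key off the instruction kinds: two accesses of the same
  // kind are compared by base register and offset, while accesses of
  // different kinds are only disjoint when their address spaces cannot
  // overlap (FLAT may alias anything, so it never proves disjointness).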
  if (isDS(Opc0)) {
    if (isDS(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1);
  }

  if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
    if (isMUBUF(Opc1) || isMTBUF(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1) && !isSMRD(Opc1);
  }

  if (isSMRD(Opc0)) {
    if (isSMRD(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0);
  }

  if (isFLAT(Opc0)) {
    if (isFLAT(Opc1))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  int64_t SVal = Imm.getSExtValue();
  if (SVal >= -16 && SVal <= 64)
    return true;

  if (Imm.getBitWidth() == 64) {
    uint64_t Val = Imm.getZExtValue();
    return (DoubleToBits(0.0) == Val) ||
           (DoubleToBits(1.0) == Val) ||
           (DoubleToBits(-1.0) == Val) ||
           (DoubleToBits(0.5) == Val) ||
           (DoubleToBits(-0.5) == Val) ||
           (DoubleToBits(2.0) == Val) ||
           (DoubleToBits(-2.0) == Val) ||
           (DoubleToBits(4.0) == Val) ||
           (DoubleToBits(-4.0) == Val);
  }

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values. For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.
  uint32_t Val = Imm.getZExtValue();

  return (FloatToBits(0.0f) == Val) ||
         (FloatToBits(1.0f) == Val) ||
         (FloatToBits(-1.0f) == Val) ||
         (FloatToBits(0.5f) == Val) ||
         (FloatToBits(-0.5f) == Val) ||
         (FloatToBits(2.0f) == Val) ||
         (FloatToBits(-2.0f) == Val) ||
         (FloatToBits(4.0f) == Val) ||
         (FloatToBits(-4.0f) == Val);
}

bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   unsigned OpSize) const {
  if (MO.isImm()) {
    // MachineOperand provides no way to tell the true operand size, since it
    // only records a 64-bit value. We need to know the size to determine if a
    // 32-bit floating point immediate bit pattern is legal for an integer
    // immediate. It would be for any 32-bit integer operand, but would not be
    // for a 64-bit one.
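    // For example, the bit pattern 0x3f800000 (1.0f) is an inline constant
    // for a 32-bit operand, but viewed as a 64-bit immediate it is neither a
    // small integer nor one of the double constants, so it would have to be
    // encoded as a literal.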
    unsigned BitSize = 8 * OpSize;
    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
  }

  return false;
}

bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
                                    unsigned OpSize) const {
  return MO.isImm() && !isInlineConstant(MO, OpSize);
}

static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
  if (isLiteralConstant(MO, OpSize))
    return RI.opCanUseLiteralConstant(OpInfo.OperandType);

  return RI.opCanUseInlineConstant(OpInfo.OperandType);
}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.
  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  unsigned OpSize) const {
  // Literal constants use the constant bus.
  if (isLiteralConstant(MO, OpSize))
    return true;

  if (!MO.isReg() || !MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus.
  if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
      (!MO.isImplicit() &&
       (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
        AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
    return true;
  }

  return false;
}

bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI->getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI->getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM32:
      break;
    case AMDGPU::OPERAND_REG_INLINE_C:
      if (isLiteralConstant(MI->getOperand(i),
                            RI.getRegClass(RegClass)->getSize())) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      // Fall-through
    default:
      continue;
    }

    if (!MI->getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI->getOperand(i).getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify VOP*
  if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
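    // A VOP* instruction may read at most one SGPR / literal through the
    // constant bus, so count how many distinct constant-bus values the source
    // operands use and reject anything above one.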
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;
    unsigned SGPRUsed = AMDGPU::NoRegister;
    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI->getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
        if (MO.isReg()) {
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  return true;
}

unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::S_MOV_B32:
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  }
}

bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &MO = MI->getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
    .addOperand(MO);
  MO.ChangeToRegister(Reg, false);
}

unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  assert(SuperReg.isReg());

  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    // XXX - Is there a better way to do this?
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(Op.getImm() >> 32);

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
                                    MachineBasicBlock::iterator MI,
                                    MachineRegisterInfo &MRI,
                                    const TargetRegisterClass *RC,
                                    const MachineOperand &Op) const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned Dst = MRI.createVirtualRegister(RC);

  MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
                             LoDst)
                       .addImm(Op.getImm() & 0xFFFFFFFF);
  MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
                             HiDst)
                       .addImm(Op.getImm() >> 32);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
    .addReg(LoDst)
    .addImm(AMDGPU::sub0)
    .addReg(HiDst)
    .addImm(AMDGPU::sub1);

  Worklist.push_back(Lo);
  Worklist.push_back(Hi);

  return Dst;
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1).
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
  assert(Inst->getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst->getOperand(1);
  Inst->RemoveOperand(1);
  Inst->addOperand(Op1);
}

bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = get(MI->getOpcode());
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI->getOperand(OpIdx);

  if (isVALU(InstDesc.Opcode) &&
      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
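    // The proposed operand consumes the constant bus, which a VALU
    // instruction can only use once, so it is only legal if no other operand
    // already uses the constant bus with a different value.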
    unsigned SGPRUsed =
        MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI->getOperand(i);
      if (Op.isReg() && Op.getReg() != SGPRUsed &&
          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    const TargetRegisterClass *RC =
        TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
            MRI.getRegClass(MO->getReg()) :
            RI.getPhysRegClass(MO->getReg());

    // In order to be legal, the common sub-class must be equal to the
    // class of the current operand. For example:
    //
    // v_mov_b32 s0 ; Operand defined as vsrc_32
    //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
    //
    // s_sendmsg 0, s0 ; Operand defined as m0reg
    //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  // Legalize VOP2
  if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
    // Legalize src0
    if (!isOperandLegal(MI, Src0Idx))
      legalizeOpWithMove(MI, Src0Idx);

    // Legalize src1
    if (isOperandLegal(MI, Src1Idx))
      return;

    // Usually src0 of VOP2 instructions allows more types of inputs
    // than src1, so try to commute the instruction to decrease our
    // chances of having to insert a MOV instruction to legalize src1.
    if (MI->isCommutable()) {
      if (commuteInstruction(MI))
        // If we are successful in commuting, then we know MI is legal, so
        // we are done.
        return;
    }

    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // XXX - Do any VOP3 instructions read VCC?
  // Legalize VOP3
  if (isVOP3(MI->getOpcode())) {
    int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };

    // Find the one SGPR operand we are allowed to use.
    unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

    for (unsigned i = 0; i < 3; ++i) {
      int Idx = VOP3Idx[i];
      if (Idx == -1)
        break;
      MachineOperand &MO = MI->getOperand(Idx);

      if (MO.isReg()) {
        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
          continue; // VGPRs are legal

        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");

        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
          SGPRReg = MO.getReg();
          // We can use one SGPR in each VOP3 instruction.
          continue;
        }
      } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
        // If it is not a register and not a literal constant, then it must be
        // an inline constant which is always legal.
        continue;
      }
      // If we make it this far, then the operand is not legal and we must
      // legalize it.
      legalizeOpWithMove(MI, Idx);
    }
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
      MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPRs, then they all must be VGPRs; otherwise
    // we will create illegal VGPR->SGPR copies when legalizing them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock *InsertBB;
      MachineBasicBlock::iterator Insert;
      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
        InsertBB = MI->getParent();
        Insert = MI;
      } else {
        // MI is a PHI instruction.
        InsertBB = MI->getOperand(i + 1).getMBB();
        Insert = InsertBB->getFirstTerminator();
      }
      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
              get(AMDGPU::COPY), DstReg)
          .addOperand(MI->getOperand(i));
      MI->getOperand(i).setReg(DstReg);
    }
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
          .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();
    // Extract the ptr from the resource descriptor.
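    // The 128-bit resource descriptor (srsrc) carries the buffer base address
    // in its low 64 bits (sub0/sub1); the high 64 bits hold the remaining
    // descriptor fields (num_records, format bits, and so on), which
    // getDefaultRsrcDataFormat() reproduces below.  We peel the base pointer
    // out so it can be folded into vaddr, and rebuild a descriptor whose base
    // address is zero.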
    // SRsrcPtrLo = srsrc:sub0
    unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
        &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);

    // SRsrcPtrHi = srsrc:sub1
    unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
        &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
        .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
        .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
        .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
        .addReg(Zero64)
        .addImm(AMDGPU::sub0_sub1)
        .addReg(SRsrcFormatLo)
        .addImm(AMDGPU::sub2)
        .addReg(SRsrcFormatHi)
        .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    unsigned NewVAddrLo;
    unsigned NewVAddrHi;
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
              NewVAddrLo)
          .addReg(SRsrcPtrLo)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
          .addReg(AMDGPU::VCC, RegState::ImplicitDefine);

      // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
              NewVAddrHi)
          .addReg(SRsrcPtrHi)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
          .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
          .addReg(AMDGPU::VCC, RegState::Implicit);

    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);

      // Create the new instruction.
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
      MachineInstr *Addr64 =
          BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
              .addOperand(*VData)
              .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                          // This will be replaced later
                                          // with the new value of vaddr.
              .addOperand(*SRsrc)
              .addOperand(*SOffset)
              .addOperand(*Offset)
              .addImm(0) // glc
              .addImm(0) // slc
              .addImm(0); // tfe

      MI->removeFromParent();
      MI = Addr64;

      NewVAddrLo = SRsrcPtrLo;
      NewVAddrHi = SRsrcPtrHi;
      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    }

    // NewVaddr = {NewVaddrHi, NewVaddrLo}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

void SIInstrInfo::splitSMRD(MachineInstr *MI,
                            const TargetRegisterClass *HalfRC,
                            unsigned HalfImmOp, unsigned HalfSGPROp,
                            MachineInstr *&Lo, MachineInstr *&Hi) const {

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RegLo = MRI.createVirtualRegister(HalfRC);
  unsigned RegHi = MRI.createVirtualRegister(HalfRC);
  unsigned HalfSize = HalfRC->getSize();
  const MachineOperand *OffOp =
      getNamedOperand(*MI, AMDGPU::OpName::offset);
  const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);

  // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
  // on VI.

  bool IsKill = SBase->isKill();
  if (OffOp) {
    bool isVI =
        MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
        AMDGPUSubtarget::VOLCANIC_ISLANDS;
    unsigned OffScale = isVI ? 1 : 4;
    // Handle the _IMM variant
    unsigned LoOffset = OffOp->getImm() * OffScale;
    unsigned HiOffset = LoOffset + HalfSize;
    Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
             // Use addReg instead of addOperand
             // to make sure kill flag is cleared.
             .addReg(SBase->getReg(), 0, SBase->getSubReg())
             .addImm(LoOffset / OffScale);

    if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
      unsigned OffsetSGPR =
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
          .addImm(HiOffset); // The offset in register is in bytes.
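      // HiOffset does not fit in the instruction's immediate field, so it was
      // materialized into OffsetSGPR above; the high half therefore uses the
      // _SGPR form of the load.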
      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
               .addReg(SBase->getReg(), getKillRegState(IsKill),
                       SBase->getSubReg())
               .addReg(OffsetSGPR);
    } else {
      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
               .addReg(SBase->getReg(), getKillRegState(IsKill),
                       SBase->getSubReg())
               .addImm(HiOffset / OffScale);
    }
  } else {
    // Handle the _SGPR variant
    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
             .addReg(SBase->getReg(), 0, SBase->getSubReg())
             .addOperand(*SOff);
    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
        .addOperand(*SOff)
        .addImm(HalfSize);
    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
             .addReg(SBase->getReg(), getKillRegState(IsKill),
                     SBase->getSubReg())
             .addReg(OffsetSGPR);
  }

  unsigned SubLo, SubHi;
  switch (HalfSize) {
  case 4:
    SubLo = AMDGPU::sub0;
    SubHi = AMDGPU::sub1;
    break;
  case 8:
    SubLo = AMDGPU::sub0_sub1;
    SubHi = AMDGPU::sub2_sub3;
    break;
  case 16:
    SubLo = AMDGPU::sub0_sub1_sub2_sub3;
    SubHi = AMDGPU::sub4_sub5_sub6_sub7;
    break;
  case 32:
    SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
    SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
    break;
  default:
    llvm_unreachable("Unhandled HalfSize");
  }

  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
      .addOperand(MI->getOperand(0))
      .addReg(RegLo)
      .addImm(SubLo)
      .addReg(RegHi)
      .addImm(SubHi);
}

void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
                                 MachineRegisterInfo &MRI) const {
  MachineBasicBlock *MBB = MI->getParent();
  switch (MI->getOpcode()) {
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORD_SGPR:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    unsigned NewOpcode = getVALUOp(*MI);
    unsigned RegOffset;
    unsigned ImmOffset;

    if (MI->getOperand(2).isReg()) {
      RegOffset = MI->getOperand(2).getReg();
      ImmOffset = 0;
    } else {
      assert(MI->getOperand(2).isImm());
      // SMRD instructions take a dword offset on SI and a byte offset on VI,
      // while MUBUF instructions always take a byte offset.
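      // For example, an SI SMRD immediate offset of 1 addresses dword 1, i.e.
      // byte 4, so the value is shifted left by 2 below before it is reused as
      // a MUBUF byte offset; on VI the SMEM offset is already in bytes.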
2044 ImmOffset = MI->getOperand(2).getImm(); 2045 if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= 2046 AMDGPUSubtarget::SEA_ISLANDS) 2047 ImmOffset <<= 2; 2048 RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2049 2050 if (isUInt<12>(ImmOffset)) { 2051 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), 2052 RegOffset) 2053 .addImm(0); 2054 } else { 2055 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), 2056 RegOffset) 2057 .addImm(ImmOffset); 2058 ImmOffset = 0; 2059 } 2060 } 2061 2062 unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); 2063 unsigned DWord0 = RegOffset; 2064 unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2065 unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2066 unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2067 uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); 2068 2069 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) 2070 .addImm(0); 2071 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) 2072 .addImm(RsrcDataFormat & 0xFFFFFFFF); 2073 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) 2074 .addImm(RsrcDataFormat >> 32); 2075 BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) 2076 .addReg(DWord0) 2077 .addImm(AMDGPU::sub0) 2078 .addReg(DWord1) 2079 .addImm(AMDGPU::sub1) 2080 .addReg(DWord2) 2081 .addImm(AMDGPU::sub2) 2082 .addReg(DWord3) 2083 .addImm(AMDGPU::sub3); 2084 MI->setDesc(get(NewOpcode)); 2085 if (MI->getOperand(2).isReg()) { 2086 MI->getOperand(2).setReg(SRsrc); 2087 } else { 2088 MI->getOperand(2).ChangeToRegister(SRsrc, false); 2089 } 2090 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); 2091 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); 2092 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc 2093 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc 2094 MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe 2095 2096 const TargetRegisterClass *NewDstRC = 2097 RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); 2098 2099 unsigned DstReg = MI->getOperand(0).getReg(); 2100 unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); 2101 MRI.replaceRegWith(DstReg, NewDstReg); 2102 break; 2103 } 2104 case AMDGPU::S_LOAD_DWORDX8_IMM: 2105 case AMDGPU::S_LOAD_DWORDX8_SGPR: { 2106 MachineInstr *Lo, *Hi; 2107 splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, 2108 AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); 2109 MI->eraseFromParent(); 2110 moveSMRDToVALU(Lo, MRI); 2111 moveSMRDToVALU(Hi, MRI); 2112 break; 2113 } 2114 2115 case AMDGPU::S_LOAD_DWORDX16_IMM: 2116 case AMDGPU::S_LOAD_DWORDX16_SGPR: { 2117 MachineInstr *Lo, *Hi; 2118 splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, 2119 AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); 2120 MI->eraseFromParent(); 2121 moveSMRDToVALU(Lo, MRI); 2122 moveSMRDToVALU(Hi, MRI); 2123 break; 2124 } 2125 } 2126 } 2127 2128 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { 2129 SmallVector<MachineInstr *, 128> Worklist; 2130 Worklist.push_back(&TopInst); 2131 2132 while (!Worklist.empty()) { 2133 MachineInstr *Inst = Worklist.pop_back_val(); 2134 MachineBasicBlock *MBB = Inst->getParent(); 2135 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2136 2137 unsigned Opcode = Inst->getOpcode(); 2138 unsigned NewOpcode = getVALUOp(*Inst); 2139 2140 // Handle some special 
    // cases.
    switch (Opcode) {
    default:
      if (isSMRD(Inst->getOpcode())) {
        moveSMRDToVALU(Inst, MRI);
      }
      break;
    case AMDGPU::S_MOV_B64: {
      DebugLoc DL = Inst->getDebugLoc();

      // If the source operand is a register we can replace this with a
      // copy.
      if (Inst->getOperand(1).isReg()) {
        MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
            .addOperand(Inst->getOperand(0))
            .addOperand(Inst->getOperand(1));
        Worklist.push_back(Copy);
      } else {
        // Otherwise, we need to split this into two movs, because there is
        // no 64-bit VALU move instruction.
        unsigned Reg = Inst->getOperand(0).getReg();
        unsigned Dst = split64BitImm(Worklist,
                                     Inst,
                                     MRI,
                                     MRI.getRegClass(Reg),
                                     Inst->getOperand(1));
        MRI.replaceRegWith(Reg, Dst);
      }
      Inst->eraseFromParent();
      continue;
    }
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64: {
      splitScalar64BitBFE(Worklist, Inst);
      Inst->eraseFromParent();
      continue;
    }

    case AMDGPU::S_LSHL_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst->setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
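    // Walk the operands in reverse so that RemoveOperand() below does not
    // shift the indices of operands we have not visited yet.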
2258 for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { 2259 MachineOperand &Op = Inst->getOperand(i); 2260 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) 2261 Inst->RemoveOperand(i); 2262 } 2263 2264 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 2265 // We are converting these to a BFE, so we need to add the missing 2266 // operands for the size and offset. 2267 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 2268 Inst->addOperand(MachineOperand::CreateImm(0)); 2269 Inst->addOperand(MachineOperand::CreateImm(Size)); 2270 2271 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 2272 // The VALU version adds the second operand to the result, so insert an 2273 // extra 0 operand. 2274 Inst->addOperand(MachineOperand::CreateImm(0)); 2275 } 2276 2277 addDescImplicitUseDef(NewDesc, Inst); 2278 2279 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 2280 const MachineOperand &OffsetWidthOp = Inst->getOperand(2); 2281 // If we need to move this to VGPRs, we need to unpack the second operand 2282 // back into the 2 separate ones for bit offset and width. 2283 assert(OffsetWidthOp.isImm() && 2284 "Scalar BFE is only implemented for constant width and offset"); 2285 uint32_t Imm = OffsetWidthOp.getImm(); 2286 2287 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 2288 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 2289 Inst->RemoveOperand(2); // Remove old immediate. 2290 Inst->addOperand(MachineOperand::CreateImm(Offset)); 2291 Inst->addOperand(MachineOperand::CreateImm(BitWidth)); 2292 } 2293 2294 // Update the destination register class. 2295 2296 const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); 2297 2298 switch (Opcode) { 2299 // For target instructions, getOpRegClass just returns the virtual 2300 // register class associated with the operand, so we need to find an 2301 // equivalent VGPR register class in order to move the instruction to the 2302 // VALU. 
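    // e.g. a COPY or PHI that produced an SReg_64 value is rewritten to define
    // an equivalent VGPR class instead (VReg_64, assuming that is what
    // RI.getEquivalentVGPRClass returns for SReg_64), so its users can stay on
    // the VALU.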
2303 case AMDGPU::COPY: 2304 case AMDGPU::PHI: 2305 case AMDGPU::REG_SEQUENCE: 2306 case AMDGPU::INSERT_SUBREG: 2307 if (RI.hasVGPRs(NewDstRC)) 2308 continue; 2309 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 2310 if (!NewDstRC) 2311 continue; 2312 break; 2313 default: 2314 break; 2315 } 2316 2317 unsigned DstReg = Inst->getOperand(0).getReg(); 2318 unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); 2319 MRI.replaceRegWith(DstReg, NewDstReg); 2320 2321 // Legalize the operands 2322 legalizeOperands(Inst); 2323 2324 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), 2325 E = MRI.use_end(); I != E; ++I) { 2326 MachineInstr &UseMI = *I->getParent(); 2327 if (!canReadVGPR(UseMI, I.getOperandNo())) { 2328 Worklist.push_back(&UseMI); 2329 } 2330 } 2331 } 2332 } 2333 2334 //===----------------------------------------------------------------------===// 2335 // Indirect addressing callbacks 2336 //===----------------------------------------------------------------------===// 2337 2338 unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, 2339 unsigned Channel) const { 2340 assert(Channel == 0); 2341 return RegIndex; 2342 } 2343 2344 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { 2345 return &AMDGPU::VGPR_32RegClass; 2346 } 2347 2348 void SIInstrInfo::splitScalar64BitUnaryOp( 2349 SmallVectorImpl<MachineInstr *> &Worklist, 2350 MachineInstr *Inst, 2351 unsigned Opcode) const { 2352 MachineBasicBlock &MBB = *Inst->getParent(); 2353 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2354 2355 MachineOperand &Dest = Inst->getOperand(0); 2356 MachineOperand &Src0 = Inst->getOperand(1); 2357 DebugLoc DL = Inst->getDebugLoc(); 2358 2359 MachineBasicBlock::iterator MII = Inst; 2360 2361 const MCInstrDesc &InstDesc = get(Opcode); 2362 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2363 MRI.getRegClass(Src0.getReg()) : 2364 &AMDGPU::SGPR_32RegClass; 2365 2366 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2367 2368 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2369 AMDGPU::sub0, Src0SubRC); 2370 2371 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2372 const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); 2373 2374 unsigned DestSub0 = MRI.createVirtualRegister(DestRC); 2375 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2376 .addOperand(SrcReg0Sub0); 2377 2378 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2379 AMDGPU::sub1, Src0SubRC); 2380 2381 unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); 2382 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2383 .addOperand(SrcReg0Sub1); 2384 2385 unsigned FullDestReg = MRI.createVirtualRegister(DestRC); 2386 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2387 .addReg(DestSub0) 2388 .addImm(AMDGPU::sub0) 2389 .addReg(DestSub1) 2390 .addImm(AMDGPU::sub1); 2391 2392 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2393 2394 // Try to legalize the operands in case we need to swap the order to keep it 2395 // valid. 
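  // e.g. (roughly) S_NOT_B64 %dst, %src becomes:
  //   %lo  = S_NOT_B32 %src.sub0
  //   %hi  = S_NOT_B32 %src.sub1
  //   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1
  // The two halves are pushed onto the worklist so moveToVALU can convert them
  // to their 32-bit VALU forms and legalize their operands.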
2396 Worklist.push_back(LoHalf); 2397 Worklist.push_back(HiHalf); 2398 } 2399 2400 void SIInstrInfo::splitScalar64BitBinaryOp( 2401 SmallVectorImpl<MachineInstr *> &Worklist, 2402 MachineInstr *Inst, 2403 unsigned Opcode) const { 2404 MachineBasicBlock &MBB = *Inst->getParent(); 2405 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2406 2407 MachineOperand &Dest = Inst->getOperand(0); 2408 MachineOperand &Src0 = Inst->getOperand(1); 2409 MachineOperand &Src1 = Inst->getOperand(2); 2410 DebugLoc DL = Inst->getDebugLoc(); 2411 2412 MachineBasicBlock::iterator MII = Inst; 2413 2414 const MCInstrDesc &InstDesc = get(Opcode); 2415 const TargetRegisterClass *Src0RC = Src0.isReg() ? 2416 MRI.getRegClass(Src0.getReg()) : 2417 &AMDGPU::SGPR_32RegClass; 2418 2419 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 2420 const TargetRegisterClass *Src1RC = Src1.isReg() ? 2421 MRI.getRegClass(Src1.getReg()) : 2422 &AMDGPU::SGPR_32RegClass; 2423 2424 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 2425 2426 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2427 AMDGPU::sub0, Src0SubRC); 2428 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 2429 AMDGPU::sub0, Src1SubRC); 2430 2431 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 2432 const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); 2433 2434 unsigned DestSub0 = MRI.createVirtualRegister(DestRC); 2435 MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) 2436 .addOperand(SrcReg0Sub0) 2437 .addOperand(SrcReg1Sub0); 2438 2439 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 2440 AMDGPU::sub1, Src0SubRC); 2441 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 2442 AMDGPU::sub1, Src1SubRC); 2443 2444 unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); 2445 MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) 2446 .addOperand(SrcReg0Sub1) 2447 .addOperand(SrcReg1Sub1); 2448 2449 unsigned FullDestReg = MRI.createVirtualRegister(DestRC); 2450 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 2451 .addReg(DestSub0) 2452 .addImm(AMDGPU::sub0) 2453 .addReg(DestSub1) 2454 .addImm(AMDGPU::sub1); 2455 2456 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 2457 2458 // Try to legalize the operands in case we need to swap the order to keep it 2459 // valid. 2460 Worklist.push_back(LoHalf); 2461 Worklist.push_back(HiHalf); 2462 } 2463 2464 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, 2465 MachineInstr *Inst) const { 2466 MachineBasicBlock &MBB = *Inst->getParent(); 2467 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2468 2469 MachineBasicBlock::iterator MII = Inst; 2470 DebugLoc DL = Inst->getDebugLoc(); 2471 2472 MachineOperand &Dest = Inst->getOperand(0); 2473 MachineOperand &Src = Inst->getOperand(1); 2474 2475 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 2476 const TargetRegisterClass *SrcRC = Src.isReg() ? 
      MRI.getRegClass(Src.getReg()) :
      &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
      .addOperand(SrcRegSub0)
      .addImm(0);

  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
      .addOperand(SrcRegSub1)
      .addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  Worklist.push_back(First);
  Worklist.push_back(Second);
}

void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                                      MachineInstr *Inst) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst->getDebugLoc();

  MachineOperand &Dest = Inst->getOperand(0);
  uint32_t Imm = Inst->getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
         BitWidth <= 32 &&
         Offset == 0 &&
         "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
        .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
        .addImm(31)
        .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
        .addReg(MidRegLo)
        .addImm(AMDGPU::sub0)
        .addReg(MidRegHi)
        .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    return;
  }

  MachineOperand &Src = Inst->getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
      .addImm(31)
      .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(Src.getReg(), 0, AMDGPU::sub0)
      .addImm(AMDGPU::sub0)
      .addReg(TmpReg)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
}

void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
                                        MachineInstr *Inst) const {
  // Add the implicit register uses and defs from the new opcode's descriptor.
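  // MachineOperand::CreateReg(Reg, /*isDef=*/false, /*isImp=*/true) builds an
  // implicit use; passing isDef = true builds an implicit def.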
2568 if (NewDesc.ImplicitUses) { 2569 for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { 2570 unsigned Reg = NewDesc.ImplicitUses[i]; 2571 Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); 2572 } 2573 } 2574 2575 if (NewDesc.ImplicitDefs) { 2576 for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { 2577 unsigned Reg = NewDesc.ImplicitDefs[i]; 2578 Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); 2579 } 2580 } 2581 } 2582 2583 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, 2584 int OpIndices[3]) const { 2585 const MCInstrDesc &Desc = get(MI->getOpcode()); 2586 2587 // Find the one SGPR operand we are allowed to use. 2588 unsigned SGPRReg = AMDGPU::NoRegister; 2589 2590 // First we need to consider the instruction's operand requirements before 2591 // legalizing. Some operands are required to be SGPRs, such as implicit uses 2592 // of VCC, but we are still bound by the constant bus requirement to only use 2593 // one. 2594 // 2595 // If the operand's class is an SGPR, we can never move it. 2596 2597 for (const MachineOperand &MO : MI->implicit_operands()) { 2598 // We only care about reads. 2599 if (MO.isDef()) 2600 continue; 2601 2602 if (MO.getReg() == AMDGPU::VCC) 2603 return AMDGPU::VCC; 2604 2605 if (MO.getReg() == AMDGPU::FLAT_SCR) 2606 return AMDGPU::FLAT_SCR; 2607 } 2608 2609 unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; 2610 const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2611 2612 for (unsigned i = 0; i < 3; ++i) { 2613 int Idx = OpIndices[i]; 2614 if (Idx == -1) 2615 break; 2616 2617 const MachineOperand &MO = MI->getOperand(Idx); 2618 if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) 2619 SGPRReg = MO.getReg(); 2620 2621 if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 2622 UsedSGPRs[i] = MO.getReg(); 2623 } 2624 2625 if (SGPRReg != AMDGPU::NoRegister) 2626 return SGPRReg; 2627 2628 // We don't have a required SGPR operand, so we have a bit more freedom in 2629 // selecting operands to move. 2630 2631 // Try to select the most used SGPR. If an SGPR is equal to one of the 2632 // others, we choose that. 2633 // 2634 // e.g. 
2635 // V_FMA_F32 v0, s0, s0, s0 -> No moves 2636 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 2637 2638 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 2639 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 2640 SGPRReg = UsedSGPRs[0]; 2641 } 2642 2643 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 2644 if (UsedSGPRs[1] == UsedSGPRs[2]) 2645 SGPRReg = UsedSGPRs[1]; 2646 } 2647 2648 return SGPRReg; 2649 } 2650 2651 MachineInstrBuilder SIInstrInfo::buildIndirectWrite( 2652 MachineBasicBlock *MBB, 2653 MachineBasicBlock::iterator I, 2654 unsigned ValueReg, 2655 unsigned Address, unsigned OffsetReg) const { 2656 const DebugLoc &DL = MBB->findDebugLoc(I); 2657 unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( 2658 getIndirectIndexBegin(*MBB->getParent())); 2659 2660 return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) 2661 .addReg(IndirectBaseReg, RegState::Define) 2662 .addOperand(I->getOperand(0)) 2663 .addReg(IndirectBaseReg) 2664 .addReg(OffsetReg) 2665 .addImm(0) 2666 .addReg(ValueReg); 2667 } 2668 2669 MachineInstrBuilder SIInstrInfo::buildIndirectRead( 2670 MachineBasicBlock *MBB, 2671 MachineBasicBlock::iterator I, 2672 unsigned ValueReg, 2673 unsigned Address, unsigned OffsetReg) const { 2674 const DebugLoc &DL = MBB->findDebugLoc(I); 2675 unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( 2676 getIndirectIndexBegin(*MBB->getParent())); 2677 2678 return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) 2679 .addOperand(I->getOperand(0)) 2680 .addOperand(I->getOperand(1)) 2681 .addReg(IndirectBaseReg) 2682 .addReg(OffsetReg) 2683 .addImm(0); 2684 2685 } 2686 2687 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, 2688 const MachineFunction &MF) const { 2689 int End = getIndirectIndexEnd(MF); 2690 int Begin = getIndirectIndexBegin(MF); 2691 2692 if (End == -1) 2693 return; 2694 2695 2696 for (int Index = Begin; Index <= End; ++Index) 2697 Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); 2698 2699 for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) 2700 Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); 2701 2702 for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) 2703 Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); 2704 2705 for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) 2706 Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); 2707 2708 for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) 2709 Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); 2710 2711 for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) 2712 Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); 2713 } 2714 2715 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 2716 unsigned OperandName) const { 2717 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 2718 if (Idx == -1) 2719 return nullptr; 2720 2721 return &MI.getOperand(Idx); 2722 } 2723 2724 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 2725 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 2726 if (ST.isAmdHsaOS()) { 2727 RsrcDataFormat |= (1ULL << 56); 2728 2729 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2730 // Set MTYPE = 2 2731 RsrcDataFormat |= (2ULL << 59); 2732 } 2733 2734 return RsrcDataFormat; 2735 } 2736