//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// Return the number of operands of \p Node, not counting any trailing
/// MVT::Glue operands.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Return the chain operand of \p Load, which is expected to be the last
/// non-glue operand of a load node.
static SDValue findChainOperand(SDNode *Load) {
  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}

/// \brief Returns true if both nodes have the same value for the given
///        operand \p Op, or if both nodes do not have this operand.
94 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 95 unsigned Opc0 = N0->getMachineOpcode(); 96 unsigned Opc1 = N1->getMachineOpcode(); 97 98 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 99 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 100 101 if (Op0Idx == -1 && Op1Idx == -1) 102 return true; 103 104 105 if ((Op0Idx == -1 && Op1Idx != -1) || 106 (Op1Idx == -1 && Op0Idx != -1)) 107 return false; 108 109 // getNamedOperandIdx returns the index for the MachineInstr's operands, 110 // which includes the result as the first operand. We are indexing into the 111 // MachineSDNode's operands, so we need to skip the result operand to get 112 // the real index. 113 --Op0Idx; 114 --Op1Idx; 115 116 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 117 } 118 119 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, 120 AliasAnalysis *AA) const { 121 // TODO: The generic check fails for VALU instructions that should be 122 // rematerializable due to implicit reads of exec. We really want all of the 123 // generic logic for this except for this. 124 switch (MI.getOpcode()) { 125 case AMDGPU::V_MOV_B32_e32: 126 case AMDGPU::V_MOV_B32_e64: 127 case AMDGPU::V_MOV_B64_PSEUDO: 128 return true; 129 default: 130 return false; 131 } 132 } 133 134 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 135 int64_t &Offset0, 136 int64_t &Offset1) const { 137 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 138 return false; 139 140 unsigned Opc0 = Load0->getMachineOpcode(); 141 unsigned Opc1 = Load1->getMachineOpcode(); 142 143 // Make sure both are actually loads. 144 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) 145 return false; 146 147 if (isDS(Opc0) && isDS(Opc1)) { 148 149 // FIXME: Handle this case: 150 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) 151 return false; 152 153 // Check base reg. 
154 if (Load0->getOperand(1) != Load1->getOperand(1)) 155 return false; 156 157 // Check chain. 158 if (findChainOperand(Load0) != findChainOperand(Load1)) 159 return false; 160 161 // Skip read2 / write2 variants for simplicity. 162 // TODO: We should report true if the used offsets are adjacent (excluded 163 // st64 versions). 164 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || 165 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) 166 return false; 167 168 Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); 169 Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); 170 return true; 171 } 172 173 if (isSMRD(Opc0) && isSMRD(Opc1)) { 174 // Skip time and cache invalidation instructions. 175 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || 176 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) 177 return false; 178 179 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); 180 181 // Check base reg. 182 if (Load0->getOperand(0) != Load1->getOperand(0)) 183 return false; 184 185 const ConstantSDNode *Load0Offset = 186 dyn_cast<ConstantSDNode>(Load0->getOperand(1)); 187 const ConstantSDNode *Load1Offset = 188 dyn_cast<ConstantSDNode>(Load1->getOperand(1)); 189 190 if (!Load0Offset || !Load1Offset) 191 return false; 192 193 // Check chain. 194 if (findChainOperand(Load0) != findChainOperand(Load1)) 195 return false; 196 197 Offset0 = Load0Offset->getZExtValue(); 198 Offset1 = Load1Offset->getZExtValue(); 199 return true; 200 } 201 202 // MUBUF and MTBUF can access the same addresses. 203 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 204 205 // MUBUF and MTBUF have vaddr at different indices. 
206 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 207 findChainOperand(Load0) != findChainOperand(Load1) || 208 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 209 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 210 return false; 211 212 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 213 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 214 215 if (OffIdx0 == -1 || OffIdx1 == -1) 216 return false; 217 218 // getNamedOperandIdx returns the index for MachineInstrs. Since they 219 // inlcude the output in the operand list, but SDNodes don't, we need to 220 // subtract the index by one. 221 --OffIdx0; 222 --OffIdx1; 223 224 SDValue Off0 = Load0->getOperand(OffIdx0); 225 SDValue Off1 = Load1->getOperand(OffIdx1); 226 227 // The offset might be a FrameIndexSDNode. 228 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 229 return false; 230 231 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 232 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 233 return true; 234 } 235 236 return false; 237 } 238 239 static bool isStride64(unsigned Opc) { 240 switch (Opc) { 241 case AMDGPU::DS_READ2ST64_B32: 242 case AMDGPU::DS_READ2ST64_B64: 243 case AMDGPU::DS_WRITE2ST64_B32: 244 case AMDGPU::DS_WRITE2ST64_B64: 245 return true; 246 default: 247 return false; 248 } 249 } 250 251 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 252 int64_t &Offset, 253 const TargetRegisterInfo *TRI) const { 254 unsigned Opc = LdSt.getOpcode(); 255 256 if (isDS(LdSt)) { 257 const MachineOperand *OffsetImm = 258 getNamedOperand(LdSt, AMDGPU::OpName::offset); 259 if (OffsetImm) { 260 // Normal, single offset LDS instruction. 
261 const MachineOperand *AddrReg = 262 getNamedOperand(LdSt, AMDGPU::OpName::addr); 263 264 BaseReg = AddrReg->getReg(); 265 Offset = OffsetImm->getImm(); 266 return true; 267 } 268 269 // The 2 offset instructions use offset0 and offset1 instead. We can treat 270 // these as a load with a single offset if the 2 offsets are consecutive. We 271 // will use this for some partially aligned loads. 272 const MachineOperand *Offset0Imm = 273 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 274 const MachineOperand *Offset1Imm = 275 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 276 277 uint8_t Offset0 = Offset0Imm->getImm(); 278 uint8_t Offset1 = Offset1Imm->getImm(); 279 280 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 281 // Each of these offsets is in element sized units, so we need to convert 282 // to bytes of the individual reads. 283 284 unsigned EltSize; 285 if (LdSt.mayLoad()) 286 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 287 else { 288 assert(LdSt.mayStore()); 289 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 290 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 291 } 292 293 if (isStride64(Opc)) 294 EltSize *= 64; 295 296 const MachineOperand *AddrReg = 297 getNamedOperand(LdSt, AMDGPU::OpName::addr); 298 BaseReg = AddrReg->getReg(); 299 Offset = EltSize * Offset0; 300 return true; 301 } 302 303 return false; 304 } 305 306 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 307 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 308 if (SOffset && SOffset->isReg()) 309 return false; 310 311 const MachineOperand *AddrReg = 312 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 313 if (!AddrReg) 314 return false; 315 316 const MachineOperand *OffsetImm = 317 getNamedOperand(LdSt, AMDGPU::OpName::offset); 318 BaseReg = AddrReg->getReg(); 319 Offset = OffsetImm->getImm(); 320 321 if (SOffset) // soffset can be an inline immediate. 
322 Offset += SOffset->getImm(); 323 324 return true; 325 } 326 327 if (isSMRD(LdSt)) { 328 const MachineOperand *OffsetImm = 329 getNamedOperand(LdSt, AMDGPU::OpName::offset); 330 if (!OffsetImm) 331 return false; 332 333 const MachineOperand *SBaseReg = 334 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 335 BaseReg = SBaseReg->getReg(); 336 Offset = OffsetImm->getImm(); 337 return true; 338 } 339 340 if (isFLAT(LdSt)) { 341 const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 342 if (VAddr) { 343 // Can't analyze 2 offsets. 344 if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) 345 return false; 346 347 BaseReg = VAddr->getReg(); 348 } else { 349 // scratch instructions have either vaddr or saddr. 350 BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg(); 351 } 352 353 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); 354 return true; 355 } 356 357 return false; 358 } 359 360 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, 361 const MachineInstr &MI2, unsigned BaseReg2) { 362 if (BaseReg1 == BaseReg2) 363 return true; 364 365 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) 366 return false; 367 368 auto MO1 = *MI1.memoperands_begin(); 369 auto MO2 = *MI2.memoperands_begin(); 370 if (MO1->getAddrSpace() != MO2->getAddrSpace()) 371 return false; 372 373 auto Base1 = MO1->getValue(); 374 auto Base2 = MO2->getValue(); 375 if (!Base1 || !Base2) 376 return false; 377 const MachineFunction &MF = *MI1.getParent()->getParent(); 378 const DataLayout &DL = MF.getFunction()->getParent()->getDataLayout(); 379 Base1 = GetUnderlyingObject(Base1, DL); 380 Base2 = GetUnderlyingObject(Base1, DL); 381 382 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) 383 return false; 384 385 return Base1 == Base2; 386 } 387 388 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 389 unsigned BaseReg1, 390 MachineInstr &SecondLdSt, 391 unsigned BaseReg2, 392 unsigned NumLoads) const { 393 if 
(!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2)) 394 return false; 395 396 const MachineOperand *FirstDst = nullptr; 397 const MachineOperand *SecondDst = nullptr; 398 399 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 400 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 401 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 402 const unsigned MaxGlobalLoadCluster = 6; 403 if (NumLoads > MaxGlobalLoadCluster) 404 return false; 405 406 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 407 if (!FirstDst) 408 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 409 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 410 if (!SecondDst) 411 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 412 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 413 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 414 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 415 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 416 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 417 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 418 } 419 420 if (!FirstDst || !SecondDst) 421 return false; 422 423 // Try to limit clustering based on the total number of bytes loaded 424 // rather than the number of instructions. This is done to help reduce 425 // register pressure. The method used is somewhat inexact, though, 426 // because it assumes that all loads in the cluster will load the 427 // same number of bytes as FirstLdSt. 428 429 // The unit of this value is bytes. 430 // FIXME: This needs finer tuning. 
431 unsigned LoadClusterThreshold = 16; 432 433 const MachineRegisterInfo &MRI = 434 FirstLdSt.getParent()->getParent()->getRegInfo(); 435 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 436 437 return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; 438 } 439 440 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 441 MachineBasicBlock::iterator MI, 442 const DebugLoc &DL, unsigned DestReg, 443 unsigned SrcReg, bool KillSrc) { 444 MachineFunction *MF = MBB.getParent(); 445 DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), 446 "illegal SGPR to VGPR copy", 447 DL, DS_Error); 448 LLVMContext &C = MF->getFunction()->getContext(); 449 C.diagnose(IllegalCopy); 450 451 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 452 .addReg(SrcReg, getKillRegState(KillSrc)); 453 } 454 455 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 456 MachineBasicBlock::iterator MI, 457 const DebugLoc &DL, unsigned DestReg, 458 unsigned SrcReg, bool KillSrc) const { 459 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 460 461 if (RC == &AMDGPU::VGPR_32RegClass) { 462 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 463 AMDGPU::SReg_32RegClass.contains(SrcReg)); 464 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 465 .addReg(SrcReg, getKillRegState(KillSrc)); 466 return; 467 } 468 469 if (RC == &AMDGPU::SReg_32_XM0RegClass || 470 RC == &AMDGPU::SReg_32RegClass) { 471 if (SrcReg == AMDGPU::SCC) { 472 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 473 .addImm(-1) 474 .addImm(0); 475 return; 476 } 477 478 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 479 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 480 return; 481 } 482 483 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 484 .addReg(SrcReg, getKillRegState(KillSrc)); 485 return; 486 } 487 488 if (RC == &AMDGPU::SReg_64RegClass) { 489 if (DestReg == AMDGPU::VCC) { 490 if 
(AMDGPU::SReg_64RegClass.contains(SrcReg)) { 491 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 492 .addReg(SrcReg, getKillRegState(KillSrc)); 493 } else { 494 // FIXME: Hack until VReg_1 removed. 495 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 496 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 497 .addImm(0) 498 .addReg(SrcReg, getKillRegState(KillSrc)); 499 } 500 501 return; 502 } 503 504 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 505 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 506 return; 507 } 508 509 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 510 .addReg(SrcReg, getKillRegState(KillSrc)); 511 return; 512 } 513 514 if (DestReg == AMDGPU::SCC) { 515 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 516 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 517 .addReg(SrcReg, getKillRegState(KillSrc)) 518 .addImm(0); 519 return; 520 } 521 522 unsigned EltSize = 4; 523 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 524 if (RI.isSGPRClass(RC)) { 525 if (RI.getRegSizeInBits(*RC) > 32) { 526 Opcode = AMDGPU::S_MOV_B64; 527 EltSize = 8; 528 } else { 529 Opcode = AMDGPU::S_MOV_B32; 530 EltSize = 4; 531 } 532 533 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 534 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 535 return; 536 } 537 } 538 539 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 540 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 541 542 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 543 unsigned SubIdx; 544 if (Forward) 545 SubIdx = SubIndices[Idx]; 546 else 547 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 548 549 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 550 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 551 552 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 553 554 if (Idx == 0) 555 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 556 557 bool UseKill = KillSrc && Idx == SubIndices.size() - 1; 558 
Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 559 } 560 } 561 562 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 563 int NewOpc; 564 565 // Try to map original to commuted opcode 566 NewOpc = AMDGPU::getCommuteRev(Opcode); 567 if (NewOpc != -1) 568 // Check if the commuted (REV) opcode exists on the target. 569 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 570 571 // Try to map commuted to original opcode 572 NewOpc = AMDGPU::getCommuteOrig(Opcode); 573 if (NewOpc != -1) 574 // Check if the original (non-REV) opcode exists on the target. 575 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 576 577 return Opcode; 578 } 579 580 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 581 MachineBasicBlock::iterator MI, 582 const DebugLoc &DL, unsigned DestReg, 583 int64_t Value) const { 584 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 585 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 586 if (RegClass == &AMDGPU::SReg_32RegClass || 587 RegClass == &AMDGPU::SGPR_32RegClass || 588 RegClass == &AMDGPU::SReg_32_XM0RegClass || 589 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 590 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 591 .addImm(Value); 592 return; 593 } 594 595 if (RegClass == &AMDGPU::SReg_64RegClass || 596 RegClass == &AMDGPU::SGPR_64RegClass || 597 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 598 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 599 .addImm(Value); 600 return; 601 } 602 603 if (RegClass == &AMDGPU::VGPR_32RegClass) { 604 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 605 .addImm(Value); 606 return; 607 } 608 if (RegClass == &AMDGPU::VReg_64RegClass) { 609 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 610 .addImm(Value); 611 return; 612 } 613 614 unsigned EltSize = 4; 615 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 616 if (RI.isSGPRClass(RegClass)) { 617 if (RI.getRegSizeInBits(*RegClass) > 32) { 618 Opcode = 
AMDGPU::S_MOV_B64; 619 EltSize = 8; 620 } else { 621 Opcode = AMDGPU::S_MOV_B32; 622 EltSize = 4; 623 } 624 } 625 626 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 627 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 628 int64_t IdxValue = Idx == 0 ? Value : 0; 629 630 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 631 get(Opcode), RI.getSubReg(DestReg, Idx)); 632 Builder.addImm(IdxValue); 633 } 634 } 635 636 const TargetRegisterClass * 637 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 638 return &AMDGPU::VGPR_32RegClass; 639 } 640 641 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 642 MachineBasicBlock::iterator I, 643 const DebugLoc &DL, unsigned DstReg, 644 ArrayRef<MachineOperand> Cond, 645 unsigned TrueReg, 646 unsigned FalseReg) const { 647 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 648 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 649 "Not a VGPR32 reg"); 650 651 if (Cond.size() == 1) { 652 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 653 .addReg(FalseReg) 654 .addReg(TrueReg) 655 .add(Cond[0]); 656 } else if (Cond.size() == 2) { 657 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 658 switch (Cond[0].getImm()) { 659 case SIInstrInfo::SCC_TRUE: { 660 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 661 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 662 .addImm(-1) 663 .addImm(0); 664 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 665 .addReg(FalseReg) 666 .addReg(TrueReg) 667 .addReg(SReg); 668 break; 669 } 670 case SIInstrInfo::SCC_FALSE: { 671 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 672 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 673 .addImm(0) 674 .addImm(-1); 675 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 676 .addReg(FalseReg) 677 .addReg(TrueReg) 678 .addReg(SReg); 679 break; 680 } 681 case SIInstrInfo::VCCNZ: { 682 MachineOperand RegOp = 
Cond[1]; 683 RegOp.setImplicit(false); 684 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 685 .addReg(FalseReg) 686 .addReg(TrueReg) 687 .add(RegOp); 688 break; 689 } 690 case SIInstrInfo::VCCZ: { 691 MachineOperand RegOp = Cond[1]; 692 RegOp.setImplicit(false); 693 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 694 .addReg(TrueReg) 695 .addReg(FalseReg) 696 .add(RegOp); 697 break; 698 } 699 case SIInstrInfo::EXECNZ: { 700 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 701 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 702 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 703 .addImm(0); 704 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 705 .addImm(-1) 706 .addImm(0); 707 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 708 .addReg(FalseReg) 709 .addReg(TrueReg) 710 .addReg(SReg); 711 break; 712 } 713 case SIInstrInfo::EXECZ: { 714 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 715 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 716 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 717 .addImm(0); 718 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 719 .addImm(0) 720 .addImm(-1); 721 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 722 .addReg(FalseReg) 723 .addReg(TrueReg) 724 .addReg(SReg); 725 llvm_unreachable("Unhandled branch predicate EXECZ"); 726 break; 727 } 728 default: 729 llvm_unreachable("invalid branch predicate"); 730 } 731 } else { 732 llvm_unreachable("Can only handle Cond size 1 or 2"); 733 } 734 } 735 736 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 737 MachineBasicBlock::iterator I, 738 const DebugLoc &DL, 739 unsigned SrcReg, int Value) const { 740 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 741 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 742 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 743 .addImm(Value) 744 
.addReg(SrcReg); 745 746 return Reg; 747 } 748 749 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, 750 MachineBasicBlock::iterator I, 751 const DebugLoc &DL, 752 unsigned SrcReg, int Value) const { 753 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 754 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 755 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 756 .addImm(Value) 757 .addReg(SrcReg); 758 759 return Reg; 760 } 761 762 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 763 764 if (RI.getRegSizeInBits(*DstRC) == 32) { 765 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 766 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 767 return AMDGPU::S_MOV_B64; 768 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 769 return AMDGPU::V_MOV_B64_PSEUDO; 770 } 771 return AMDGPU::COPY; 772 } 773 774 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 775 switch (Size) { 776 case 4: 777 return AMDGPU::SI_SPILL_S32_SAVE; 778 case 8: 779 return AMDGPU::SI_SPILL_S64_SAVE; 780 case 16: 781 return AMDGPU::SI_SPILL_S128_SAVE; 782 case 32: 783 return AMDGPU::SI_SPILL_S256_SAVE; 784 case 64: 785 return AMDGPU::SI_SPILL_S512_SAVE; 786 default: 787 llvm_unreachable("unknown register size"); 788 } 789 } 790 791 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 792 switch (Size) { 793 case 4: 794 return AMDGPU::SI_SPILL_V32_SAVE; 795 case 8: 796 return AMDGPU::SI_SPILL_V64_SAVE; 797 case 12: 798 return AMDGPU::SI_SPILL_V96_SAVE; 799 case 16: 800 return AMDGPU::SI_SPILL_V128_SAVE; 801 case 32: 802 return AMDGPU::SI_SPILL_V256_SAVE; 803 case 64: 804 return AMDGPU::SI_SPILL_V512_SAVE; 805 default: 806 llvm_unreachable("unknown register size"); 807 } 808 } 809 810 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 811 MachineBasicBlock::iterator MI, 812 unsigned SrcReg, bool isKill, 813 int FrameIndex, 814 const 
TargetRegisterClass *RC, 815 const TargetRegisterInfo *TRI) const { 816 MachineFunction *MF = MBB.getParent(); 817 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 818 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 819 DebugLoc DL = MBB.findDebugLoc(MI); 820 821 assert(SrcReg != MFI->getStackPtrOffsetReg() && 822 SrcReg != MFI->getFrameOffsetReg() && 823 SrcReg != MFI->getScratchWaveOffsetReg()); 824 825 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 826 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 827 MachinePointerInfo PtrInfo 828 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 829 MachineMemOperand *MMO 830 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 831 Size, Align); 832 unsigned SpillSize = TRI->getSpillSize(*RC); 833 834 if (RI.isSGPRClass(RC)) { 835 MFI->setHasSpilledSGPRs(); 836 837 // We are only allowed to create one new instruction when spilling 838 // registers, so we need to use pseudo instruction for spilling SGPRs. 839 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); 840 841 // The SGPR spill/restore instructions only work on number sgprs, so we need 842 // to make sure we are using the correct register class. 843 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { 844 MachineRegisterInfo &MRI = MF->getRegInfo(); 845 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 846 } 847 848 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) 849 .addReg(SrcReg, getKillRegState(isKill)) // data 850 .addFrameIndex(FrameIndex) // addr 851 .addMemOperand(MMO) 852 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 853 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 854 // Add the scratch resource registers as implicit uses because we may end up 855 // needing them, and need to ensure that the reserved registers are 856 // correctly handled. 
857 858 FrameInfo.setStackID(FrameIndex, 1); 859 if (ST.hasScalarStores()) { 860 // m0 is used for offset to scalar stores if used to spill. 861 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); 862 } 863 864 return; 865 } 866 867 if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { 868 LLVMContext &Ctx = MF->getFunction()->getContext(); 869 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 870 " spill register"); 871 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 872 .addReg(SrcReg); 873 874 return; 875 } 876 877 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 878 879 unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); 880 MFI->setHasSpilledVGPRs(); 881 BuildMI(MBB, MI, DL, get(Opcode)) 882 .addReg(SrcReg, getKillRegState(isKill)) // data 883 .addFrameIndex(FrameIndex) // addr 884 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 885 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 886 .addImm(0) // offset 887 .addMemOperand(MMO); 888 } 889 890 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 891 switch (Size) { 892 case 4: 893 return AMDGPU::SI_SPILL_S32_RESTORE; 894 case 8: 895 return AMDGPU::SI_SPILL_S64_RESTORE; 896 case 16: 897 return AMDGPU::SI_SPILL_S128_RESTORE; 898 case 32: 899 return AMDGPU::SI_SPILL_S256_RESTORE; 900 case 64: 901 return AMDGPU::SI_SPILL_S512_RESTORE; 902 default: 903 llvm_unreachable("unknown register size"); 904 } 905 } 906 907 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 908 switch (Size) { 909 case 4: 910 return AMDGPU::SI_SPILL_V32_RESTORE; 911 case 8: 912 return AMDGPU::SI_SPILL_V64_RESTORE; 913 case 12: 914 return AMDGPU::SI_SPILL_V96_RESTORE; 915 case 16: 916 return AMDGPU::SI_SPILL_V128_RESTORE; 917 case 32: 918 return AMDGPU::SI_SPILL_V256_RESTORE; 919 case 64: 920 return AMDGPU::SI_SPILL_V512_RESTORE; 921 default: 922 llvm_unreachable("unknown register size"); 923 } 924 } 925 926 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 927 
MachineBasicBlock::iterator MI, 928 unsigned DestReg, int FrameIndex, 929 const TargetRegisterClass *RC, 930 const TargetRegisterInfo *TRI) const { 931 MachineFunction *MF = MBB.getParent(); 932 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 933 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 934 DebugLoc DL = MBB.findDebugLoc(MI); 935 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 936 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 937 unsigned SpillSize = TRI->getSpillSize(*RC); 938 939 MachinePointerInfo PtrInfo 940 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 941 942 MachineMemOperand *MMO = MF->getMachineMemOperand( 943 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 944 945 if (RI.isSGPRClass(RC)) { 946 // FIXME: Maybe this should not include a memoperand because it will be 947 // lowered to non-memory instructions. 948 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 949 if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { 950 MachineRegisterInfo &MRI = MF->getRegInfo(); 951 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 952 } 953 954 FrameInfo.setStackID(FrameIndex, 1); 955 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 956 .addFrameIndex(FrameIndex) // addr 957 .addMemOperand(MMO) 958 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 959 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 960 961 if (ST.hasScalarStores()) { 962 // m0 is used for offset to scalar stores if used to spill. 
      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
    }

    return;
  }

  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)        // vaddr
    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
    .addReg(MFI->getFrameOffsetReg()) // scratch_offset
    .addImm(0)                        // offset
    .addMemOperand(MMO);
}

/// \brief Materialize in \p TmpReg the LDS address for a frame-index spill:
/// LDSSize + FrameOffset * WorkGroupSize + per-lane TID offset.
///
/// The per-lane TID value is computed once per function (cached via
/// MFI->getTIDReg()/setTIDReg()) in the entry block.
///
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled.
/// \returns \p TmpReg on success, or AMDGPU::NoRegister if no free VGPR
///          could be found for the TID value.
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    // The TID computation is inserted once, at the top of the entry block.
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    // NOTE: intentionally shadows the outer DL — entry-block instructions use
    // the entry block's debug location, not the spill site's.
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      // Multiple waves per workgroup: fold the workgroup IDs into the TID so
      // each lane of each wave gets a distinct slot.
      unsigned TIDIGXReg
        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the lane id by 4 (dword-sized slots).
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

/// \brief Insert enough S_NOPs before \p MI to cover \p Count wait states.
/// A single S_NOP covers (imm + 1) wait states, with imm in [0, 7], so up to
/// eight wait states are packed into each emitted S_NOP.
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

/// \brief Insert a single wait state (one S_NOP) before \p MI.
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

/// \brief Append the appropriate return pseudo (S_ENDPGM or
/// SI_RETURN_TO_EPILOG) to \p MBB if it has no successors and no terminator
/// yet. Only valid for entry functions.
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator)
      BuildMI(MBB, MBB.end(), DebugLoc(),
              get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
  }
}

/// \brief Number of wait states an instruction provides: S_NOP yields
/// imm + 1; everything else counts as one.
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

/// \brief Expand target pseudo instructions after register allocation.
/// Returns true if \p MI was a pseudo handled here (it is either rewritten
/// in place via setDesc or replaced and erased).
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Expand a 64-bit VGPR move into two 32-bit moves of the sub0/sub1
    // halves. The implicit super-register defs keep liveness of the full
    // 64-bit register intact.
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Write the source to the *inactive* lanes by inverting EXEC around the
    // move, then restoring it.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // 64-bit variant: same EXEC-invert trick, with the move emitted as a
    // V_MOV_B64_PSEUDO that is immediately expanded recursively.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOVRELD_B32_V1:
  case AMDGPU::V_MOVRELD_B32_V2:
  case AMDGPU::V_MOVRELD_B32_V4:
  case AMDGPU::V_MOVRELD_B32_V8:
  case AMDGPU::V_MOVRELD_B32_V16: {
    // Indexed write into a vector register: lower to V_MOVRELD_B32_e32 on
    // the statically-known base subregister; M0 supplies the dynamic index.
    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    unsigned VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstr *MovRel =
        BuildMI(MBB, MI, DL, MovRelDesc)
            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
            .add(MI.getOperand(2))
            .addReg(VecReg, RegState::ImplicitDefine)
            .addReg(VecReg,
                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // Tie the implicit def of the whole vector register to its implicit use
    // so the untouched elements stay live through the partial write.
    const int ImpDefIdx =
        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI.getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
      MIB.addImm(0);
    else
      MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
    // is exited.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
  }
  return true;
}

/// \brief Swap the src0/src1 modifier operand immediates of \p MI, if the
/// instruction has them. Returns false (and changes nothing) when \p MI has
/// no src0 modifiers operand.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

/// \brief Swap a register operand with an immediate/frame-index operand in
/// place, preserving the register flags. Returns &MI on success, nullptr if
/// \p NonRegOp is neither an immediate nor a frame index (nothing modified).
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  // Save the register state before RegOp is overwritten below.
  unsigned Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

/// \brief Commute \p MI's src0/src1 operands (in place; NewMI is unsupported),
/// switching to the commuted opcode and swapping source modifiers as needed.
/// Returns the commuted instruction, or nullptr if commuting is not legal.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!MI.isCommutable())
    return false;

  unsigned Opc = MI.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

/// \brief Return true if a branch over \p BrOffset bytes is encodable in the
/// (debug-adjustable) signed 16-bit SIMM16 field of the branch instructions.
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

/// \brief Destination block of a branch \p MI, or nullptr for the
/// register-indirect S_SETPC_B64.
MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

/// \brief Expand an out-of-range unconditional branch: compute the
/// destination address with s_getpc_b64 + 64-bit add/sub, then jump via
/// s_setpc_b64. \p MBB is a freshly inserted, empty block. Returns the byte
/// size of the emitted sequence.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  // Replace the virtual-register placeholder with a scavenged physical
  // SGPR pair, scanning backwards from the s_getpc_b64.
  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
                                       MachineBasicBlock::iterator(GetPC), 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Size: s_getpc_b64 (4) + s_add/s_sub pair (8) + carry op (4) +
  // s_setpc_b64 (4).
  return 4 + 8 + 4 + 4;
}

/// \brief Map a BranchPredicate to the corresponding S_CBRANCH_* opcode.
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

/// \brief Map an S_CBRANCH_* opcode back to its BranchPredicate
/// (INVALID_BR for anything else).
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

/// \brief analyzeBranch starting at terminator \p I. Fills TBB/FBB/Cond per
/// the TargetInstrInfo::analyzeBranch contract; returns true when the
/// terminator sequence cannot be analyzed.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

/// \brief TargetInstrInfo::analyzeBranch, with special handling for a
/// leading SI_MASK_BRANCH terminator.
bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
1614 if (TBB != MaskBrDest || Cond.empty()) 1615 return true; 1616 1617 auto Pred = Cond[0].getImm(); 1618 return (Pred != EXECZ && Pred != EXECNZ); 1619 } 1620 1621 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 1622 int *BytesRemoved) const { 1623 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 1624 1625 unsigned Count = 0; 1626 unsigned RemovedSize = 0; 1627 while (I != MBB.end()) { 1628 MachineBasicBlock::iterator Next = std::next(I); 1629 if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { 1630 I = Next; 1631 continue; 1632 } 1633 1634 RemovedSize += getInstSizeInBytes(*I); 1635 I->eraseFromParent(); 1636 ++Count; 1637 I = Next; 1638 } 1639 1640 if (BytesRemoved) 1641 *BytesRemoved = RemovedSize; 1642 1643 return Count; 1644 } 1645 1646 // Copy the flags onto the implicit condition register operand. 1647 static void preserveCondRegFlags(MachineOperand &CondReg, 1648 const MachineOperand &OrigCond) { 1649 CondReg.setIsUndef(OrigCond.isUndef()); 1650 CondReg.setIsKill(OrigCond.isKill()); 1651 } 1652 1653 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 1654 MachineBasicBlock *TBB, 1655 MachineBasicBlock *FBB, 1656 ArrayRef<MachineOperand> Cond, 1657 const DebugLoc &DL, 1658 int *BytesAdded) const { 1659 if (!FBB && Cond.empty()) { 1660 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1661 .addMBB(TBB); 1662 if (BytesAdded) 1663 *BytesAdded = 4; 1664 return 1; 1665 } 1666 1667 if(Cond.size() == 1 && Cond[0].isReg()) { 1668 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 1669 .add(Cond[0]) 1670 .addMBB(TBB); 1671 return 1; 1672 } 1673 1674 assert(TBB && Cond[0].isImm()); 1675 1676 unsigned Opcode 1677 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 1678 1679 if (!FBB) { 1680 Cond[1].isUndef(); 1681 MachineInstr *CondBr = 1682 BuildMI(&MBB, DL, get(Opcode)) 1683 .addMBB(TBB); 1684 1685 // Copy the flags onto the implicit condition register operand. 
1686 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1687 1688 if (BytesAdded) 1689 *BytesAdded = 4; 1690 return 1; 1691 } 1692 1693 assert(TBB && FBB); 1694 1695 MachineInstr *CondBr = 1696 BuildMI(&MBB, DL, get(Opcode)) 1697 .addMBB(TBB); 1698 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1699 .addMBB(FBB); 1700 1701 MachineOperand &CondReg = CondBr->getOperand(1); 1702 CondReg.setIsUndef(Cond[1].isUndef()); 1703 CondReg.setIsKill(Cond[1].isKill()); 1704 1705 if (BytesAdded) 1706 *BytesAdded = 8; 1707 1708 return 2; 1709 } 1710 1711 bool SIInstrInfo::reverseBranchCondition( 1712 SmallVectorImpl<MachineOperand> &Cond) const { 1713 if (Cond.size() != 2) { 1714 return true; 1715 } 1716 1717 if (Cond[0].isImm()) { 1718 Cond[0].setImm(-Cond[0].getImm()); 1719 return false; 1720 } 1721 1722 return true; 1723 } 1724 1725 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1726 ArrayRef<MachineOperand> Cond, 1727 unsigned TrueReg, unsigned FalseReg, 1728 int &CondCycles, 1729 int &TrueCycles, int &FalseCycles) const { 1730 switch (Cond[0].getImm()) { 1731 case VCCNZ: 1732 case VCCZ: { 1733 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1734 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1735 assert(MRI.getRegClass(FalseReg) == RC); 1736 1737 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1738 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1739 1740 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1741 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1742 } 1743 case SCC_TRUE: 1744 case SCC_FALSE: { 1745 // FIXME: We could insert for VGPRs if we could replace the original compare 1746 // with a vector one. 
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    assert(MRI.getRegClass(FalseReg) == RC);

    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;

    // Multiples of 8 can do s_cselect_b64
    if (NumInsts % 2 == 0)
      NumInsts /= 2;

    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    return RI.isSGPRClass(RC);
  }
  default:
    return false;
  }
}

/// \brief Emit a select of \p TrueReg/\p FalseReg into \p DstReg based on
/// \p Cond (as produced by analyzeBranch). Wide registers are split into
/// 32-bit (VALU) or 64-bit (SALU) pieces joined with a REG_SEQUENCE.
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, const DebugLoc &DL,
                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
                               unsigned TrueReg, unsigned FalseReg) const {
  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
  // Canonicalize to the "true" form of the predicate by swapping operands.
  if (Pred == VCCZ || Pred == SCC_FALSE) {
    Pred = static_cast<BranchPredicate>(-Pred);
    std::swap(TrueReg, FalseReg);
  }

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
  unsigned DstSize = RI.getRegSizeInBits(*DstRC);

  if (DstSize == 32) {
    unsigned SelOp = Pred == SCC_TRUE ?
      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;

    // Instruction's operands are backwards from what is expected.
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 64 && Pred == SCC_TRUE) {
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 32;

  // 64-bit select is only avaialble for SALU.
  if (Pred == SCC_TRUE) {
    SelOp = AMDGPU::S_CSELECT_B64;
    EltRC = &AMDGPU::SGPR_64RegClass;
    SubIndices = Sub0_15_64;

    assert(NElts % 2 == 0);
    NElts /= 2;
  }

  MachineInstrBuilder MIB = BuildMI(
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  // Insert the per-element selects before the REG_SEQUENCE that collects them.
  I = MIB->getIterator();

  SmallVector<unsigned, 8> Regs;
  for (int Idx = 0; Idx != NElts; ++Idx) {
    unsigned DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(FalseReg, 0, SubIdx)
        .addReg(TrueReg, 0, SubIdx);
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}

/// \brief Return true if \p MI is a plain register/immediate move whose
/// source can be folded into its users.
bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

/// \brief Address space used for a pseudo-source-value kind: stack kinds map
/// to private, pool/GOT/jump-table/etc. to constant, anything else to flat.
unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
    PseudoSourceValue::PSVKind Kind) const {
  switch(Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUASI.PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUASI.CONSTANT_ADDRESS;
  }
  return AMDGPUASI.FLAT_ADDRESS;
}

/// \brief Strip the src0/src1/src2 modifier operands from \p MI.
/// Removal is highest-index-first so the remaining indices stay valid.
static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

/// \brief Try to fold the immediate defined by \p DefMI into its single use
/// \p UseMI: a COPY becomes a direct move of the immediate, and a MAD/MAC
/// becomes MADMK (constant multiplied) or MADAK (constant added).
/// Returns true and may erase \p DefMI on success.
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    switch (DefMI.getOpcode()) {
    default:
      return false;
    case AMDGPU::S_MOV_B64:
      // TODO: We could fold 64-bit immediates, but this gets complicated
      // when there are sub-registers.
      return false;

    case AMDGPU::V_MOV_B32_e32:
    case AMDGPU::S_MOV_B32:
      break;
    }
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
    assert(ImmOp);
    // FIXME: We could handle FrameIndex values here.
    if (!ImmOp->isImm()) {
      return false;
    }
    UseMI.setDesc(get(NewOpc));
    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    return true;
  }

  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    // Don't fold if we are using source or output modifiers. The new VOP2
    // instructions don't have them.
    if (hasAnyModifiersSet(UseMI))
      return false;

    const MachineOperand &ImmOp = DefMI.getOperand(1);

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);

    // Any src operand can be used for the legality check.
    if (isInlineConstant(UseMI, *Src0, ImmOp))
      return false;

    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      // MAC ties src2 to the dst; drop the tie before rewriting operands.
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI.getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

/// \brief Return true if [OffsetA, OffsetA+WidthA) and
/// [OffsetB, OffsetB+WidthB) do not overlap.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

/// \brief Return true if the memory accesses of \p MIa and \p MIb provably
/// do not overlap (same base register, disjoint offset ranges).
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                               MachineInstr &MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {

    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
      // FIXME: Handle ds_read2 / ds_write2.
      return false;
    }
    // Access widths come from the single memory operand of each instruction.
    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
    if (BaseReg0 == BaseReg1 &&
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
      return true;
    }
  }

  return false;
}

/// \brief Return true if \p MIa and \p MIb provably access disjoint memory.
/// Falls back to alias analysis when both accesses carry IR values, then to
/// address-space reasoning (DS vs MUBUF/MTBUF vs SMRD vs FLAT).
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
                                                  MachineInstr &MIb,
                                                  AliasAnalysis *AA) const {
  assert((MIa.mayLoad() || MIa.mayStore()) &&
         "MIa must load from or modify a memory location");
  assert((MIb.mayLoad() || MIb.mayStore()) &&
         "MIb must load from or modify a memory location");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
    if (MMOa->getValue() && MMOb->getValue()) {
      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
      if (!AA->alias(LocA, LocB))
        return true;
    }
  }

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    // DS (LDS/GDS) is disjoint from everything except FLAT, which may also
    // address LDS unless it is segment-specific.
    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return !isFLAT(MIb) && !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    // NOTE(review): the buffer checks below test MIa (already known to be
    // SMRD) rather than MIb — looks like they were meant to be !isMUBUF(MIb)
    // && !isMTBUF(MIb); verify intent before relying on this path.
    return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    return false;
  }

  return false;
}

/// \brief If \p MO is a virtual register uniquely defined by a
/// V_MOV_B32_e32 of an immediate, return that immediate; otherwise return 0.
/// NOTE(review): the not-foldable result is spelled 'false' and
/// 'AMDGPU::NoRegister' but both are just 0 in this int64_t return; callers
/// test the result in a boolean context, so an actual immediate 0 is also
/// reported as "not foldable" (harmless, 0 is an inline constant).
static int64_t getFoldableImm(const MachineOperand* MO) {
  if (!MO->isReg())
    return false;
  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  auto Def = MRI.getUniqueVRegDef(MO->getReg());
  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
      Def->getOperand(1).isImm())
    return Def->getOperand(1).getImm();
  return AMDGPU::NoRegister;
}

/// \brief Convert a two-address V_MAC_{F16,F32} into the three-address
/// V_MAD form (or V_MADAK/V_MADMK when one source folds to an immediate).
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {
  bool IsF16 = false;

  switch (MI.getOpcode()) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e32: {
    // e32 forms can carry a literal src0; a non-inline literal cannot be
    // moved into the VOP3 MAD encoding.
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      return nullptr;
    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const
  MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);

  // MADAK/MADMK have no modifier operands, so only try them when no
  // modifiers are present.
  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg())) {
    // Addend is a foldable immediate: v_madak dst, src0, src1, K.
    if (auto Imm = getFoldableImm(Src2)) {
      return BuildMI(*MBB, MI, MI.getDebugLoc(),
                     get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
               .add(*Dst)
               .add(*Src0)
               .add(*Src1)
               .addImm(Imm);
    }
    // One multiplicand is a foldable immediate: v_madmk dst, src0, K, src2.
    if (auto Imm = getFoldableImm(Src1)) {
      return BuildMI(*MBB, MI, MI.getDebugLoc(),
                     get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
               .add(*Dst)
               .add(*Src0)
               .addImm(Imm)
               .add(*Src2);
    }
    if (auto Imm = getFoldableImm(Src0)) {
      // Multiplication commutes, so Src1 can take src0's slot — but only if
      // it is legal there.
      // NOTE(review): the legality query uses the V_MADMK_F32 operand index
      // even in the F16 case — presumably the src0 index matches between the
      // two; confirm.
      if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
                           AMDGPU::OpName::src0), Src1))
        return BuildMI(*MBB, MI, MI.getDebugLoc(),
                       get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
                 .add(*Dst)
                 .add(*Src1)
                 .addImm(Imm)
                 .add(*Src2);
    }
  }

  // General case: full VOP3 MAD, carrying over any modifiers.
  return BuildMI(*MBB, MI, MI.getDebugLoc(),
                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
      .add(*Dst)
      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
      .add(*Src0)
      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
      .add(*Src1)
      .addImm(0) // Src mods
      .add(*Src2)
      .addImm(Clamp ? Clamp->getImm() : 0)
      .addImm(Omod ? Omod->getImm() : 0);
}

// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}

/// \brief Treat EXEC writes, SETREG, and VGPR-indexing-mode changes as
/// scheduling boundaries in addition to the generic target-independent ones.
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // XXX - Do we want the SP check in the base implementation?

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
         changesVGPRIndexingMode(MI);
}

/// \brief Return true if \p Imm is encodable as an inline constant for its
/// bit width (16/32/64), honoring the subtarget's 1/(2*pi) support.
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  switch (Imm.getBitWidth()) {
  case 32:
    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 64:
    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 16:
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

/// \brief Return true if immediate operand \p MO can be encoded inline for an
/// operand of type \p OperandType (one of the AMDGPU OPERAND_SRC_* kinds).
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                  uint8_t OperandType) const {
  // Non-immediates and non-source operand types can never be inline constants.
  if (!MO.isImm() ||
      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
      OperandType > AMDGPU::OPERAND_SRC_LAST)
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  int64_t Imm = MO.getImm();
  switch (OperandType) {
  case AMDGPU::OPERAND_REG_IMM_INT32:
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
    // Must fit in 32 bits losslessly before checking inlinability.
    int32_t Trunc = static_cast<int32_t>(Imm);
    return Trunc == Imm &&
           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  case AMDGPU::OPERAND_REG_IMM_INT64:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                        ST.hasInv2PiInlineImm());
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
    // Packed 16-bit pair: the whole 32-bit pattern must be inlinable.
    uint32_t Trunc = static_cast<uint32_t>(Imm);
    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
  }
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

/// \brief Return true if \p MO would be emitted as a literal constant (i.e.
/// it is not a register and not encodable as an inline constant).
bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
                                        const MCOperandInfo &OpInfo) const {
  switch (MO.getType()) {
  case MachineOperand::MO_Register:
    return false;
  case MachineOperand::MO_Immediate:
    return !isInlineConstant(MO, OpInfo);
  case MachineOperand::MO_FrameIndex:
  case MachineOperand::MO_MachineBasicBlock:
  case MachineOperand::MO_ExternalSymbol:
  case MachineOperand::MO_GlobalAddress:
  case MachineOperand::MO_MCSymbol:
    // All of these are eventually lowered to some literal encoding.
    return true;
  default:
    llvm_unreachable("unexpected operand type");
  }
}

/// \brief Compare two operands for equality; only register and immediate
/// operands are expected here.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

/// \brief Return true if immediate \p MO is legal for operand \p OpNo of
/// \p MI, considering inline-constant and literal-constant capability of the
/// operand type.
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];

  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());

  // Plain immediate operands (not register-or-immediate) accept anything.
  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (OpInfo.RegClass < 0)
    return false;

  if (MO.isImm() && isInlineConstant(MO, OpInfo))
    return RI.opCanUseInlineConstant(OpInfo.OperandType);

  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
}

/// \brief Return true if \p Opcode has a VOP3 encoding with a usable 32-bit
/// (e32) counterpart on this subtarget.
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
  int Op32 = AMDGPU::getVOPe32(Opcode);
  if (Op32 == -1)
    return false;

  return pseudoToMCOpcode(Op32) != -1;
}

bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifier operand is present on all instructions
  // that have modifiers.

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

/// \brief Return true if the named modifier operand exists and is non-zero.
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

/// \brief Return true if any source modifier, clamp, or omod is set on \p MI.
bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         hasModifiersSet(MI, AMDGPU::OpName::omod);
}

/// \brief Return true if operand \p MO consumes the single SGPR/constant read
/// port ("constant bus") of a VALU instruction: literal constants, SGPR reads,
/// and special registers such as VCC, M0, EXEC, and FLAT_SCR.
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  const MCOperandInfo &OpInfo) const {
  // Literal constants use the constant bus.
  //if (isLiteralConstantLike(MO, OpInfo))
  //  return true;
  if (MO.isImm())
    return !isInlineConstant(MO, OpInfo);

  if (!MO.isReg())
    return true; // Misc other operands like FrameIndex

  if (!MO.isUse())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // FLAT_SCR is just an SGPR pair.
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
    return true;

  // EXEC register uses the constant bus.
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
    return true;

  // SGPRs use the constant bus
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
          (!MO.isImplicit() &&
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
}

/// \brief Return the special SGPR (VCC, M0, or FLAT_SCR) implicitly read by
/// \p MI, or AMDGPU::NoRegister if there is none.
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

/// \brief Return true if \p MI is expected to carry an implicit EXEC use.
/// Lane-access instructions (readlane/writelane) and scalar/generic opcodes
/// are exempt.
static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READLANE_B32_si:
    case AMDGPU::V_READLANE_B32_vi:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::V_WRITELANE_B32_si:
    case AMDGPU::V_WRITELANE_B32_vi:
      return false;
    }

    return true;
  }

  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}

/// \brief Return true if \p SubReg refers to a sub-register of \p SuperVec:
/// physical sub-register containment, or (for virtuals) the same register
/// accessed through a sub-register index.
static bool isSubRegOf(const SIRegisterInfo &TRI,
                       const MachineOperand &SuperVec,
                       const MachineOperand &SubReg) {
  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}

/// \brief Machine verifier hook: check SI-specific encoding constraints
/// (operand counts and classes, SDWA rules, constant-bus usage, SOPK
/// immediates, movrel implicit operands, EXEC reads, scalar-store soffset,
/// and flat-offset support). On failure sets \p ErrInfo and returns false.
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  if
      (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    return true;

  const MachineFunction *MF = MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  if (MI.isInlineAsm()) {
    // Verify register classes for inlineasm constraints.
    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
         I != E; ++I) {
      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
      if (!RC)
        continue;

      const MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg())
        continue;

      unsigned Reg = Op.getReg();
      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
        ErrInfo = "inlineasm operand has incorrect register class.";
        return false;
      }
    }

    return true;
  }

  // Make sure the register classes are correct.
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    if (MI.getOperand(i).isFPImm()) {
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                "all fp values to integers.";
      return false;
    }

    int RegClass = Desc.OpInfo[i].RegClass;

    switch (Desc.OpInfo[i].OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (MI.getOperand(i).isImm()) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    case AMDGPU::OPERAND_REG_IMM_INT32:
    case AMDGPU::OPERAND_REG_IMM_FP32:
      // These accept either a register or any 32-bit immediate.
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
    case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
      // "Inline-constant-only" operands: an immediate must be inlinable.
      const MachineOperand &MO = MI.getOperand(i);
      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
        ErrInfo = "Illegal immediate value for operand.";
        return false;
      }
      break;
    }
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
      // FrameIndex operands will be replaced by immediates, so they are
      // allowed.
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
        ErrInfo = "Expected immediate, but got non-immediate";
        return false;
      }
      LLVM_FALLTHROUGH;
    default:
      continue;
    }

    if (!MI.getOperand(i).isReg())
      continue;

    if (RegClass != -1) {
      unsigned Reg = MI.getOperand(i).getReg();
      if (Reg == AMDGPU::NoRegister ||
          TargetRegisterInfo::isVirtualRegister(Reg))
        continue;

      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
      if (!RC->contains(Reg)) {
        ErrInfo = "Operand has incorrect register class.";
        return false;
      }
    }
  }

  // Verify SDWA
  if (isSDWA(MI)) {
    if (!ST.hasSDWA()) {
      ErrInfo = "SDWA is not supported on this target";
      return false;
    }

    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);

    const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };

    for (int OpIdx: OpIndicies) {
      if (OpIdx == -1)
        continue;
      const MachineOperand &MO = MI.getOperand(OpIdx);

      if (!ST.hasSDWAScalar()) {
        // Only VGPRS on VI
        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
          return false;
        }
      } else {
        // No immediates on GFX9
        if (!MO.isReg()) {
          ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
          return false;
        }
      }
    }

    if (!ST.hasSDWAOmod()) {
      // No omod allowed on VI
      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
      if (OMod != nullptr &&
        (!OMod->isImm() || OMod->getImm() != 0)) {
        ErrInfo = "OMod not allowed in SDWA instructions on VI";
        return false;
      }
    }

    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
    if (isVOPC(BasicOpcode)) {
      if (!ST.hasSDWASdst() && DstIdx != -1) {
        // Only vcc allowed as dst on VI for VOPC
        const MachineOperand &Dst = MI.getOperand(DstIdx);
        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
          return false;
        }
      } else if (!ST.hasSDWAOutModsVOPC()) {
        // No clamp allowed on GFX9 for VOPC
        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
        if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
          return false;
        }

        // No omod allowed on GFX9 for VOPC
        const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
        if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
          ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
          return false;
        }
      }
    }
  }

  // Verify VOP*
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
    // Only look at the true operands. Only a real operand can use the constant
    // bus, and we don't want to check pseudo-operands like the source modifier
    // flags.
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };

    unsigned ConstantBusCount = 0;

    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
      ++ConstantBusCount;

    unsigned SGPRUsed = findImplicitSGPRRead(MI);
    if (SGPRUsed != AMDGPU::NoRegister)
      ++ConstantBusCount;

    for (int OpIdx : OpIndices) {
      if (OpIdx == -1)
        break;
      const MachineOperand &MO = MI.getOperand(OpIdx);
      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
        if (MO.isReg()) {
          // Multiple reads of the SAME SGPR count as one constant-bus use.
          if (MO.getReg() != SGPRUsed)
            ++ConstantBusCount;
          SGPRUsed = MO.getReg();
        } else {
          ++ConstantBusCount;
        }
      }
    }
    if (ConstantBusCount > 1) {
      ErrInfo = "VOP* instruction uses the constant bus more than once";
      return false;
    }
  }

  // Verify misc. restrictions on specific instructions.
  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
      if (!compareMachineOp(Src0, Src1) &&
          !compareMachineOp(Src0, Src2)) {
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
        return false;
      }
    }
  }

  if (isSOPK(MI)) {
    // simm16 must fit the (zero- or sign-extended) 16-bit field.
    int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
    if (sopkIsZext(MI)) {
      if (!isUInt<16>(Imm)) {
        ErrInfo = "invalid immediate for SOPK instruction";
        return false;
      }
    } else {
      if (!isInt<16>(Imm)) {
        ErrInfo = "invalid immediate for SOPK instruction";
        return false;
      }
    }
  }

  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;

    const unsigned StaticNumOps = Desc.getNumOperands() +
      Desc.getNumImplicitUses();
    const unsigned NumImplicitOps = IsDst ? 2 : 1;

    // Allow additional implicit operands. This allows a fixup done by the post
    // RA scheduler where the main implicit operand is killed and implicit-defs
    // are added for sub-registers that remain live after this instruction.
    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
      ErrInfo = "missing implicit register operands";
      return false;
    }

    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (IsDst) {
      if (!Dst->isUse()) {
        ErrInfo = "v_movreld_b32 vdst should be a use operand";
        return false;
      }

      unsigned UseOpIdx;
      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
          UseOpIdx != StaticNumOps + 1) {
        ErrInfo = "movrel implicit operands should be tied";
        return false;
      }
    }

    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const MachineOperand &ImpUse
      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
    if (!ImpUse.isReg() || !ImpUse.isUse() ||
        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
      ErrInfo = "src0 should be subreg of implicit vector use";
      return false;
    }
  }

  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when using let Uses to try to add other use registers.
  if (shouldReadExec(MI)) {
    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
  }

  if (isSMRD(MI)) {
    if (MI.mayStore()) {
      // The register offset form of scalar stores may only use m0 as the
      // soffset register.
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
      if (Soff && Soff->getReg() != AMDGPU::M0) {
        ErrInfo = "scalar stores must use m0 as offset register";
        return false;
      }
    }
  }

  if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    if (Offset->getImm() != 0) {
      ErrInfo = "subtarget does not support offsets in flat instructions";
      return false;
    }
  }

  return true;
}

/// \brief Return the VALU opcode equivalent to scalar instruction \p MI, or
/// AMDGPU::INSTRUCTION_LIST_END if there is no equivalent.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::WWM: return AMDGPU::WWM;
  case AMDGPU::S_MOV_B32:
    // Register moves become COPYs; immediate moves become v_mov_b32.
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  // Sign-extensions are expressed as bitfield extracts on the VALU.
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): S_NOT_B64 maps to the 32-bit V_NOT — presumably the
  // 64-bit op is split into two 32-bit halves by the caller; confirm.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  // SCC-based branches become VCC-based branches.
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

/// \brief Return true if \p MI has a VALU equivalent per getVALUOp.
bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
}

/// \brief Return the register class of operand \p OpNo of \p MI, preferring
/// the class from the instruction descriptor and falling back to the class of
/// the actual register for variadic/unclassed operands.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

/// \brief Return true if operand \p OpNo of \p MI may read a VGPR. For the
/// generic copy-like opcodes this is determined by the destination class.
bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return
RI.hasVGPRs(getOpRegClass(MI, 0)); 2913 default: 2914 return RI.hasVGPRs(getOpRegClass(MI, OpNo)); 2915 } 2916 } 2917 2918 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 2919 MachineBasicBlock::iterator I = MI; 2920 MachineBasicBlock *MBB = MI.getParent(); 2921 MachineOperand &MO = MI.getOperand(OpIdx); 2922 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 2923 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 2924 const TargetRegisterClass *RC = RI.getRegClass(RCID); 2925 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 2926 if (MO.isReg()) 2927 Opcode = AMDGPU::COPY; 2928 else if (RI.isSGPRClass(RC)) 2929 Opcode = AMDGPU::S_MOV_B32; 2930 2931 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 2932 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 2933 VRC = &AMDGPU::VReg_64RegClass; 2934 else 2935 VRC = &AMDGPU::VGPR_32RegClass; 2936 2937 unsigned Reg = MRI.createVirtualRegister(VRC); 2938 DebugLoc DL = MBB->findDebugLoc(I); 2939 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 2940 MO.ChangeToRegister(Reg, false); 2941 } 2942 2943 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 2944 MachineRegisterInfo &MRI, 2945 MachineOperand &SuperReg, 2946 const TargetRegisterClass *SuperRC, 2947 unsigned SubIdx, 2948 const TargetRegisterClass *SubRC) 2949 const { 2950 MachineBasicBlock *MBB = MI->getParent(); 2951 DebugLoc DL = MI->getDebugLoc(); 2952 unsigned SubReg = MRI.createVirtualRegister(SubRC); 2953 2954 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 2955 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 2956 .addReg(SuperReg.getReg(), 0, SubIdx); 2957 return SubReg; 2958 } 2959 2960 // Just in case the super register is itself a sub-register, copy it to a new 2961 // value so we don't need to worry about merging its subreg index with the 2962 // SubIdx passed to this function. The register coalescer should be able to 2963 // eliminate this extra copy. 
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
      .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

// Like buildExtractSubReg, but also handles an immediate source: for an
// immediate operand, return the requested 32-bit half (sub0 = low bits,
// sub1 = high bits) as a new immediate operand instead of emitting copies.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}

// True if register operand \p MO satisfies the register-class constraint
// \p OpInfo from the instruction descriptor.
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  unsigned Reg = MO.getReg();
  const TargetRegisterClass *RC =
      TargetRegisterInfo::isVirtualRegister(Reg) ?
      MRI.getRegClass(Reg) :
      RI.getPhysRegClass(Reg);

  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  // Narrow to the class actually read through the operand's subreg index.
  RC = TRI->getSubRegClass(RC, MO.getSubReg());

  // In order to be legal, the common sub-class must be equal to the
  // class of the current operand. For example:
  //
  // v_mov_b32 s0 ; Operand defined as vsrc_b32
  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

// True if \p MO is usable where a VSrc operand is expected: either a legal
// register for \p OpInfo, or an immediate-like operand.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

// Check whether operand \p OpIdx of \p MI would be legal if it held \p MO
// (or its current operand when \p MO is null), including the VALU single
// constant-bus-use restriction.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ?
          RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {

    // Only one constant-bus source is allowed; scan the other operands for a
    // second (different) constant-bus use.
    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          return false;
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Legalize the src operands of a VOP2/VOPC instruction: fix up src0 when an
// implicit SGPR read already consumes the constant bus, then make src1 legal,
// preferring commutation over inserting moves when that alone fixes it.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI.getOperand(Src0Idx);

    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Manually swap src0 and src1 (including subregs and kill flags).
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

// Copy the value of VGPR \p SrcReg into a fresh SGPR of the equivalent-width
// class, one 32-bit channel at a time via V_READFIRSTLANE_B32, and reassemble
// the pieces with a REG_SEQUENCE inserted before \p UseMI. Returns the new
// SGPR virtual register. (Callers rely on the value being uniform.)
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB =
      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
              get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

// Make \p Op (a register operand used by the instruction at \p I) have class
// \p DstRC, inserting a COPY at \p I in \p InsertMBB when it does not. When
// the copied value was produced by a move-immediate, try to fold the
// immediate into the copy.
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  unsigned OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  unsigned DstReg = MRI.createVirtualRegister(DstRC);
  MachineInstr *Copy =
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

  Op.setReg(DstReg);
  Op.setSubReg(0);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate())
    FoldImmediate(*Copy, *Def, OpReg, &MRI);
}

// Rewrite \p MI in place so that all of its operands satisfy the
// instruction's register-class constraints, dispatching on instruction kind
// (VOP2/VOPC, VOP3, SMRD, PHI, REG_SEQUENCE, INSERT_SUBREG, MIMG,
// MUBUF/MTBUF).
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;

      // MI is a PHI instruction.
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      // Avoid creating no-op copies with the same src and dst reg class. These
      // confuse some of the machine passes.
      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI.getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock *MBB = MI.getParent();
      MachineOperand &Op = MI.getOperand(1);
      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    }
    return;
  }

  // Legalize MIMG and MUBUF/MTBUF for shaders.
  //
  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
  // scratch memory access. In both cases, the legalization never involves
  // conversion to the addr64 form.
  if (isMIMG(MI) ||
      (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
       (isMUBUF(MI) || isMTBUF(MI)))) {
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions by converting to addr64 form.
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them as above. This especially applies to the
  // buffer_load_format_* variants and variants with idxen (or bothen).
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI.getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
        .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
        .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
        .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
        .addReg(Zero64)
        .addImm(AMDGPU::sub0_sub1)
        .addReg(SRsrcFormatLo)
        .addImm(AMDGPU::sub2)
        .addReg(SRsrcFormatHi)
        .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI.getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrLo, NewVaddrHi}
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
          .addReg(NewVAddrLo)
          .addImm(AMDGPU::sub0)
          .addReg(NewVAddrHi)
          .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
             < SISubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                .add(*VData)
                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                // This will be replaced later
                // with the new value of vaddr.
                .add(*SRsrc)
                .add(*SOffset)
                .add(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
                getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                     .add(*VData)
                     .add(*VDataIn)
                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                     // This will be replaced later
                     // with the new value of vaddr.
                     .add(*SRsrc)
                     .add(*SOffset)
                     .add(*Offset)
                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
                     .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
      }

      MI.removeFromParent();

      // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
              NewVAddr)
          .addReg(SRsrcPtr, 0, AMDGPU::sub0)
          .addImm(AMDGPU::sub0)
          .addReg(SRsrcPtr, 0, AMDGPU::sub1)
          .addImm(AMDGPU::sub1);

      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

// Worklist-driven conversion of an SALU instruction (and any SALU users of
// its result) into equivalent VALU instructions.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SetVectorType Worklist;
  Worklist.insert(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned
             NewOpcode = getVALUOp(Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64:
      splitScalar64BitBFE(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    // On VI and newer the VALU shift instructions are *REV: the shift amount
    // comes first, so the operands must be swapped.
    case AMDGPU::S_LSHL_B32:
      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::EXEC)
          .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");

    case AMDGPU::S_PACK_LL_B32_B16:
    case AMDGPU::S_PACK_LH_B32_B16:
    case AMDGPU::S_PACK_HH_B32_B16:
      movePackToVALU(Worklist, MRI, Inst);
      Inst.eraseFromParent();
      continue;
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst.setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst.getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        Inst.RemoveOperand(i);
        addSCCDefUsersToVALUWorklist(Inst, Worklist);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst.addOperand(MachineOperand::CreateImm(0));
      Inst.addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst.addOperand(MachineOperand::CreateImm(0));
    }

    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst.RemoveOperand(2);                      // Remove old immediate.
      Inst.addOperand(MachineOperand::CreateImm(Offset));
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      unsigned DstReg = Inst.getOperand(0).getReg();
      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
        continue;

      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
      if (!NewDstRC)
        continue;

      if (Inst.isCopy() &&
          TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge. Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
        MRI.clearKillFlags(Inst.getOperand(1).getReg());
        Inst.getOperand(0).setReg(DstReg);
        continue;
      }

      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    legalizeOperands(Inst);

    if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

// Lower S_ABS_I32 to VALU code: abs(x) = max(x, 0 - x).
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                 MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  // ResultReg = max(Src, TmpReg)
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Split a 64-bit scalar unary op into two 32-bit VALU ops (one per half)
// and recombine the halves with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 =
Inst.getOperand(1); 3815 DebugLoc DL = Inst.getDebugLoc(); 3816 3817 MachineBasicBlock::iterator MII = Inst; 3818 3819 const MCInstrDesc &InstDesc = get(Opcode); 3820 const TargetRegisterClass *Src0RC = Src0.isReg() ? 3821 MRI.getRegClass(Src0.getReg()) : 3822 &AMDGPU::SGPR_32RegClass; 3823 3824 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 3825 3826 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3827 AMDGPU::sub0, Src0SubRC); 3828 3829 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 3830 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 3831 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 3832 3833 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 3834 BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 3835 3836 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 3837 AMDGPU::sub1, Src0SubRC); 3838 3839 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 3840 BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 3841 3842 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 3843 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 3844 .addReg(DestSub0) 3845 .addImm(AMDGPU::sub0) 3846 .addReg(DestSub1) 3847 .addImm(AMDGPU::sub1); 3848 3849 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 3850 3851 // We don't need to legalizeOperands here because for a single operand, src0 3852 // will support any kind of input. 3853 3854 // Move all users of this moved value. 
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// \brief Split a 64-bit scalar binary operation into two 32-bit VALU
/// operations on the sub0/sub1 halves and recombine them with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(
  SetVectorType &Worklist, MachineInstr &Inst,
  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half of the result.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  // High half of the result.
  unsigned DestSub1 =
    MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  // Recombine the two halves into the full 64-bit destination.
  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// \brief Lower a 64-bit scalar bit count to two 32-bit V_BCNT instructions:
/// count the low half, then count the high half while chaining in the first
/// count through the second instruction's add operand.
void SIInstrInfo::splitScalar64BitBCNT(
  SetVectorType &Worklist, MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = bcnt(lo half) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  // ResultReg = bcnt(hi half) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Lower S_BFE_I64 (64-bit signed bitfield extract) to VALU
/// instructions. Only offset-0, width <= 32 (sext_inreg style) extracts are
/// handled; everything else asserts.
void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  // Offset is only read by the assertion below.
  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // Sign-extend the low BitWidth bits within the low 32-bit half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    // The high half is all copies of the sign bit of the low half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: keep the low half and fill the high half with the sign
  // bit of the low half.
  MachineOperand &Src = Inst.getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Add every instruction that uses \p DstReg in an operand position
/// that cannot read a VGPR to the move-to-VALU worklist.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  unsigned DstReg,
  MachineRegisterInfo &MRI,
  SetVectorType &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
      Worklist.insert(&UseMI);

      // Skip the remaining uses belonging to this same instruction; it has
      // already been queued.
      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

/// \brief Lower an S_PACK_* pseudo, which packs two 16-bit values into one
/// 32-bit register, to equivalent VALU bit operations.
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    // Mask src0 to its low 16 bits.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    // Result = (src1 << 16) | (src0 & 0xffff).
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    // Select the low 16 bits from src0 and the high 16 bits from src1.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    // Move src0's high 16 bits down into the low half.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    // Result = (src1 & 0xffff0000) | (src0 >> 16).
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// \brief Queue every user of the SCC result of \p SCCDefInst for conversion
/// to the VALU, scanning forward until the next SCC def.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
    MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI :
       make_range(MachineBasicBlock::iterator(SCCDefInst),
                  SCCDefInst.getParent()->end())) {
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
      return;

    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
      Worklist.insert(&MI);
  }
}

/// \brief Return the VGPR-equivalent register class for \p Inst's destination,
/// or null if no conversion is needed (or none exists).
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
  case AMDGPU::WQM:
  case AMDGPU::WWM:
    if (RI.hasVGPRs(NewDstRC))
      return nullptr;

    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    if (!NewDstRC)
      return nullptr;
    return NewDstRC;
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// \brief Return a pointer to \p MI's operand with the given operand name, or
/// null if the instruction does not have that operand.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// \brief Return the default buffer resource descriptor data-format dword,
/// adjusted with OS- and generation-specific ATC and MTYPE bits.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// \brief Compute dwords 2 and 3 of the scratch buffer resource descriptor.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64.
  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

/// \brief SMRD instructions are classified as low latency.
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

/// \brief Buffer (MUBUF/MTBUF) and image (MIMG) instructions are classified
/// as high latency.
bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

/// \brief If \p MI accesses a stack slot through its vaddr operand, set
/// \p FrameIndex and return the data (vdata) register; otherwise return
/// AMDGPU::NoRegister.
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

/// \brief Like isStackAccess, but for SGPR spill pseudos, which use the
/// addr/data operand names instead of vaddr/vdata.
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// \brief Return the encoded size of \p MI in bytes, inspecting the operands
/// of 4-byte-base-encoding instructions for an extra 32-bit literal.
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  //
  // FIXME: Instructions that have a base 32-bit encoding report their size as
  // 4, even though they are really 8 bytes if they have a literal operand.
  if (DescSize != 0 && DescSize != 4)
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    if (isFixedSize(MI))
      return DescSize;

    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return 4; // No operands.

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return 8;

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return 4;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return 8;

    return 4;
  }

  if (DescSize == 4)
    return 4;

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::EH_LABEL:
    return 0;
  case TargetOpcode::INLINEASM: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
  }
  default:
    llvm_unreachable("unable to find instruction size");
  }
}

/// \brief Return true if \p MI is a FLAT instruction that may access the flat
/// address space; conservatively true when no memory operands are present.
bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return
      false;

  // No memory info: conservatively assume a flat access is possible.
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }
  return false;
}

bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
}

/// \brief Rewrite a non-uniform if region: replace the branch pseudo in
/// \p IfEntry with SI_IF and insert a matching SI_END_CF into \p IfEnd.
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstr *SIIF =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
            .add(Branch->getOperand(0))
            .add(Branch->getOperand(1));
    MachineInstr *SIEND =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
            .addReg(DstReg);

    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}

/// \brief Rewrite a non-uniform loop region: build a PHI in the loop header
/// and replace the backedge branch pseudo with SI_IF_BREAK + SI_LOOP.
void SIInstrInfo::convertNonUniformLoopRegion(
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder HeaderPHIBuilder =
        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        // The backedge brings in the value produced by SI_IF_BREAK below.
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        // Every other predecessor contributes a zero.
        MachineBasicBlock *PMBB = *PI;
        unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                                  .addReg(DstReg)
                                  .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
            .addReg(BackEdgeReg)
            .addMBB(LoopEntry);

    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}

ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

/// \brief Split an operand target-flags word into its direct-flag part
/// (within MO_MASK) and its bitmask part (outside MO_MASK).
std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
}

ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" }
  };

  return makeArrayRef(TargetFlags);
}

/// \brief Non-terminator, non-copy instructions that modify EXEC are treated
/// as part of the basic block prologue.
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

/// \brief Build a V_ADD_I32_e64 into \p DestReg whose carry-out def is marked
/// dead, for callers that only need the 32-bit sum.
MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           unsigned DestReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // The carry-out register is defined but dead; only the sum is used.
  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}