1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// SI Implementation of TargetInstrInfo. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "SIInstrInfo.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUSubtarget.h" 18 #include "GCNHazardRecognizer.h" 19 #include "SIDefines.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "SIRegisterInfo.h" 22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 23 #include "Utils/AMDGPUBaseInfo.h" 24 #include "llvm/ADT/APInt.h" 25 #include "llvm/ADT/ArrayRef.h" 26 #include "llvm/ADT/SmallVector.h" 27 #include "llvm/ADT/StringRef.h" 28 #include "llvm/ADT/iterator_range.h" 29 #include "llvm/Analysis/AliasAnalysis.h" 30 #include "llvm/Analysis/MemoryLocation.h" 31 #include "llvm/Analysis/ValueTracking.h" 32 #include "llvm/CodeGen/MachineBasicBlock.h" 33 #include "llvm/CodeGen/MachineFrameInfo.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineInstr.h" 36 #include "llvm/CodeGen/MachineInstrBuilder.h" 37 #include "llvm/CodeGen/MachineInstrBundle.h" 38 #include "llvm/CodeGen/MachineMemOperand.h" 39 #include "llvm/CodeGen/MachineOperand.h" 40 #include "llvm/CodeGen/MachineRegisterInfo.h" 41 #include "llvm/CodeGen/RegisterScavenging.h" 42 #include "llvm/CodeGen/ScheduleDAG.h" 43 #include "llvm/CodeGen/SelectionDAGNodes.h" 44 #include "llvm/CodeGen/TargetOpcodes.h" 45 #include "llvm/CodeGen/TargetRegisterInfo.h" 46 #include "llvm/IR/DebugLoc.h" 47 #include "llvm/IR/DiagnosticInfo.h" 48 #include "llvm/IR/Function.h" 49 #include "llvm/IR/InlineAsm.h" 50 #include "llvm/IR/LLVMContext.h" 51 #include "llvm/MC/MCInstrDesc.h" 52 #include "llvm/Support/Casting.h" 53 
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// Count \p Node's operands, excluding any trailing glue operands.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned NumOps = Node->getNumOperands();
  while (NumOps > 0 &&
         Node->getOperand(NumOps - 1).getValueType() == MVT::Glue)
    --NumOps;
  return NumOps;
}

/// Return the chain operand of \p Load, i.e. its last non-glue operand.
static SDValue findChainOperand(SDNode *Load) {
  SDValue Chain = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  assert(Chain.getValueType() == MVT::Other && "Chain missing from load node");
  return Chain;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
95 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 96 unsigned Opc0 = N0->getMachineOpcode(); 97 unsigned Opc1 = N1->getMachineOpcode(); 98 99 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 100 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 101 102 if (Op0Idx == -1 && Op1Idx == -1) 103 return true; 104 105 106 if ((Op0Idx == -1 && Op1Idx != -1) || 107 (Op1Idx == -1 && Op0Idx != -1)) 108 return false; 109 110 // getNamedOperandIdx returns the index for the MachineInstr's operands, 111 // which includes the result as the first operand. We are indexing into the 112 // MachineSDNode's operands, so we need to skip the result operand to get 113 // the real index. 114 --Op0Idx; 115 --Op1Idx; 116 117 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 118 } 119 120 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, 121 AliasAnalysis *AA) const { 122 // TODO: The generic check fails for VALU instructions that should be 123 // rematerializable due to implicit reads of exec. We really want all of the 124 // generic logic for this except for this. 125 switch (MI.getOpcode()) { 126 case AMDGPU::V_MOV_B32_e32: 127 case AMDGPU::V_MOV_B32_e64: 128 case AMDGPU::V_MOV_B64_PSEUDO: 129 return true; 130 default: 131 return false; 132 } 133 } 134 135 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 136 int64_t &Offset0, 137 int64_t &Offset1) const { 138 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 139 return false; 140 141 unsigned Opc0 = Load0->getMachineOpcode(); 142 unsigned Opc1 = Load1->getMachineOpcode(); 143 144 // Make sure both are actually loads. 145 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) 146 return false; 147 148 if (isDS(Opc0) && isDS(Opc1)) { 149 150 // FIXME: Handle this case: 151 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) 152 return false; 153 154 // Check base reg. 
155 if (Load0->getOperand(1) != Load1->getOperand(1)) 156 return false; 157 158 // Check chain. 159 if (findChainOperand(Load0) != findChainOperand(Load1)) 160 return false; 161 162 // Skip read2 / write2 variants for simplicity. 163 // TODO: We should report true if the used offsets are adjacent (excluded 164 // st64 versions). 165 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || 166 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) 167 return false; 168 169 Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); 170 Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); 171 return true; 172 } 173 174 if (isSMRD(Opc0) && isSMRD(Opc1)) { 175 // Skip time and cache invalidation instructions. 176 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || 177 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) 178 return false; 179 180 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); 181 182 // Check base reg. 183 if (Load0->getOperand(0) != Load1->getOperand(0)) 184 return false; 185 186 const ConstantSDNode *Load0Offset = 187 dyn_cast<ConstantSDNode>(Load0->getOperand(1)); 188 const ConstantSDNode *Load1Offset = 189 dyn_cast<ConstantSDNode>(Load1->getOperand(1)); 190 191 if (!Load0Offset || !Load1Offset) 192 return false; 193 194 // Check chain. 195 if (findChainOperand(Load0) != findChainOperand(Load1)) 196 return false; 197 198 Offset0 = Load0Offset->getZExtValue(); 199 Offset1 = Load1Offset->getZExtValue(); 200 return true; 201 } 202 203 // MUBUF and MTBUF can access the same addresses. 204 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 205 206 // MUBUF and MTBUF have vaddr at different indices. 
207 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 208 findChainOperand(Load0) != findChainOperand(Load1) || 209 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 210 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 211 return false; 212 213 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 214 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 215 216 if (OffIdx0 == -1 || OffIdx1 == -1) 217 return false; 218 219 // getNamedOperandIdx returns the index for MachineInstrs. Since they 220 // inlcude the output in the operand list, but SDNodes don't, we need to 221 // subtract the index by one. 222 --OffIdx0; 223 --OffIdx1; 224 225 SDValue Off0 = Load0->getOperand(OffIdx0); 226 SDValue Off1 = Load1->getOperand(OffIdx1); 227 228 // The offset might be a FrameIndexSDNode. 229 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 230 return false; 231 232 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 233 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 234 return true; 235 } 236 237 return false; 238 } 239 240 static bool isStride64(unsigned Opc) { 241 switch (Opc) { 242 case AMDGPU::DS_READ2ST64_B32: 243 case AMDGPU::DS_READ2ST64_B64: 244 case AMDGPU::DS_WRITE2ST64_B32: 245 case AMDGPU::DS_WRITE2ST64_B64: 246 return true; 247 default: 248 return false; 249 } 250 } 251 252 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, 253 int64_t &Offset, 254 const TargetRegisterInfo *TRI) const { 255 unsigned Opc = LdSt.getOpcode(); 256 257 if (isDS(LdSt)) { 258 const MachineOperand *OffsetImm = 259 getNamedOperand(LdSt, AMDGPU::OpName::offset); 260 if (OffsetImm) { 261 // Normal, single offset LDS instruction. 
262 const MachineOperand *AddrReg = 263 getNamedOperand(LdSt, AMDGPU::OpName::addr); 264 265 BaseReg = AddrReg->getReg(); 266 Offset = OffsetImm->getImm(); 267 return true; 268 } 269 270 // The 2 offset instructions use offset0 and offset1 instead. We can treat 271 // these as a load with a single offset if the 2 offsets are consecutive. We 272 // will use this for some partially aligned loads. 273 const MachineOperand *Offset0Imm = 274 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 275 const MachineOperand *Offset1Imm = 276 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 277 278 uint8_t Offset0 = Offset0Imm->getImm(); 279 uint8_t Offset1 = Offset1Imm->getImm(); 280 281 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 282 // Each of these offsets is in element sized units, so we need to convert 283 // to bytes of the individual reads. 284 285 unsigned EltSize; 286 if (LdSt.mayLoad()) 287 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 288 else { 289 assert(LdSt.mayStore()); 290 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 291 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 292 } 293 294 if (isStride64(Opc)) 295 EltSize *= 64; 296 297 const MachineOperand *AddrReg = 298 getNamedOperand(LdSt, AMDGPU::OpName::addr); 299 BaseReg = AddrReg->getReg(); 300 Offset = EltSize * Offset0; 301 return true; 302 } 303 304 return false; 305 } 306 307 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 308 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 309 if (SOffset && SOffset->isReg()) 310 return false; 311 312 const MachineOperand *AddrReg = 313 getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 314 if (!AddrReg) 315 return false; 316 317 const MachineOperand *OffsetImm = 318 getNamedOperand(LdSt, AMDGPU::OpName::offset); 319 BaseReg = AddrReg->getReg(); 320 Offset = OffsetImm->getImm(); 321 322 if (SOffset) // soffset can be an inline immediate. 
323 Offset += SOffset->getImm(); 324 325 return true; 326 } 327 328 if (isSMRD(LdSt)) { 329 const MachineOperand *OffsetImm = 330 getNamedOperand(LdSt, AMDGPU::OpName::offset); 331 if (!OffsetImm) 332 return false; 333 334 const MachineOperand *SBaseReg = 335 getNamedOperand(LdSt, AMDGPU::OpName::sbase); 336 BaseReg = SBaseReg->getReg(); 337 Offset = OffsetImm->getImm(); 338 return true; 339 } 340 341 if (isFLAT(LdSt)) { 342 const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 343 if (VAddr) { 344 // Can't analyze 2 offsets. 345 if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) 346 return false; 347 348 BaseReg = VAddr->getReg(); 349 } else { 350 // scratch instructions have either vaddr or saddr. 351 BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg(); 352 } 353 354 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); 355 return true; 356 } 357 358 return false; 359 } 360 361 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, 362 const MachineInstr &MI2, unsigned BaseReg2) { 363 if (BaseReg1 == BaseReg2) 364 return true; 365 366 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) 367 return false; 368 369 auto MO1 = *MI1.memoperands_begin(); 370 auto MO2 = *MI2.memoperands_begin(); 371 if (MO1->getAddrSpace() != MO2->getAddrSpace()) 372 return false; 373 374 auto Base1 = MO1->getValue(); 375 auto Base2 = MO2->getValue(); 376 if (!Base1 || !Base2) 377 return false; 378 const MachineFunction &MF = *MI1.getParent()->getParent(); 379 const DataLayout &DL = MF.getFunction().getParent()->getDataLayout(); 380 Base1 = GetUnderlyingObject(Base1, DL); 381 Base2 = GetUnderlyingObject(Base1, DL); 382 383 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) 384 return false; 385 386 return Base1 == Base2; 387 } 388 389 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, 390 unsigned BaseReg1, 391 MachineInstr &SecondLdSt, 392 unsigned BaseReg2, 393 unsigned NumLoads) const { 394 if 
(!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2)) 395 return false; 396 397 const MachineOperand *FirstDst = nullptr; 398 const MachineOperand *SecondDst = nullptr; 399 400 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 401 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 402 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 403 const unsigned MaxGlobalLoadCluster = 6; 404 if (NumLoads > MaxGlobalLoadCluster) 405 return false; 406 407 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 408 if (!FirstDst) 409 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 410 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 411 if (!SecondDst) 412 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 413 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 414 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 415 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 416 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 417 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 418 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 419 } 420 421 if (!FirstDst || !SecondDst) 422 return false; 423 424 // Try to limit clustering based on the total number of bytes loaded 425 // rather than the number of instructions. This is done to help reduce 426 // register pressure. The method used is somewhat inexact, though, 427 // because it assumes that all loads in the cluster will load the 428 // same number of bytes as FirstLdSt. 429 430 // The unit of this value is bytes. 431 // FIXME: This needs finer tuning. 
432 unsigned LoadClusterThreshold = 16; 433 434 const MachineRegisterInfo &MRI = 435 FirstLdSt.getParent()->getParent()->getRegInfo(); 436 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 437 438 return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; 439 } 440 441 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 442 MachineBasicBlock::iterator MI, 443 const DebugLoc &DL, unsigned DestReg, 444 unsigned SrcReg, bool KillSrc) { 445 MachineFunction *MF = MBB.getParent(); 446 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), 447 "illegal SGPR to VGPR copy", 448 DL, DS_Error); 449 LLVMContext &C = MF->getFunction().getContext(); 450 C.diagnose(IllegalCopy); 451 452 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 453 .addReg(SrcReg, getKillRegState(KillSrc)); 454 } 455 456 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 457 MachineBasicBlock::iterator MI, 458 const DebugLoc &DL, unsigned DestReg, 459 unsigned SrcReg, bool KillSrc) const { 460 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 461 462 if (RC == &AMDGPU::VGPR_32RegClass) { 463 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 464 AMDGPU::SReg_32RegClass.contains(SrcReg)); 465 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 466 .addReg(SrcReg, getKillRegState(KillSrc)); 467 return; 468 } 469 470 if (RC == &AMDGPU::SReg_32_XM0RegClass || 471 RC == &AMDGPU::SReg_32RegClass) { 472 if (SrcReg == AMDGPU::SCC) { 473 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 474 .addImm(-1) 475 .addImm(0); 476 return; 477 } 478 479 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 480 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 481 return; 482 } 483 484 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 485 .addReg(SrcReg, getKillRegState(KillSrc)); 486 return; 487 } 488 489 if (RC == &AMDGPU::SReg_64RegClass) { 490 if (DestReg == AMDGPU::VCC) { 491 if 
(AMDGPU::SReg_64RegClass.contains(SrcReg)) { 492 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 493 .addReg(SrcReg, getKillRegState(KillSrc)); 494 } else { 495 // FIXME: Hack until VReg_1 removed. 496 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 497 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 498 .addImm(0) 499 .addReg(SrcReg, getKillRegState(KillSrc)); 500 } 501 502 return; 503 } 504 505 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 506 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 507 return; 508 } 509 510 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 511 .addReg(SrcReg, getKillRegState(KillSrc)); 512 return; 513 } 514 515 if (DestReg == AMDGPU::SCC) { 516 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 517 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 518 .addReg(SrcReg, getKillRegState(KillSrc)) 519 .addImm(0); 520 return; 521 } 522 523 unsigned EltSize = 4; 524 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 525 if (RI.isSGPRClass(RC)) { 526 if (RI.getRegSizeInBits(*RC) > 32) { 527 Opcode = AMDGPU::S_MOV_B64; 528 EltSize = 8; 529 } else { 530 Opcode = AMDGPU::S_MOV_B32; 531 EltSize = 4; 532 } 533 534 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 535 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 536 return; 537 } 538 } 539 540 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 541 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 542 543 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 544 unsigned SubIdx; 545 if (Forward) 546 SubIdx = SubIndices[Idx]; 547 else 548 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 549 550 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 551 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 552 553 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 554 555 if (Idx == 0) 556 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 557 558 bool UseKill = KillSrc && Idx == SubIndices.size() - 1; 559 
Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 560 } 561 } 562 563 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 564 int NewOpc; 565 566 // Try to map original to commuted opcode 567 NewOpc = AMDGPU::getCommuteRev(Opcode); 568 if (NewOpc != -1) 569 // Check if the commuted (REV) opcode exists on the target. 570 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 571 572 // Try to map commuted to original opcode 573 NewOpc = AMDGPU::getCommuteOrig(Opcode); 574 if (NewOpc != -1) 575 // Check if the original (non-REV) opcode exists on the target. 576 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 577 578 return Opcode; 579 } 580 581 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 582 MachineBasicBlock::iterator MI, 583 const DebugLoc &DL, unsigned DestReg, 584 int64_t Value) const { 585 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 586 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 587 if (RegClass == &AMDGPU::SReg_32RegClass || 588 RegClass == &AMDGPU::SGPR_32RegClass || 589 RegClass == &AMDGPU::SReg_32_XM0RegClass || 590 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 591 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 592 .addImm(Value); 593 return; 594 } 595 596 if (RegClass == &AMDGPU::SReg_64RegClass || 597 RegClass == &AMDGPU::SGPR_64RegClass || 598 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 599 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 600 .addImm(Value); 601 return; 602 } 603 604 if (RegClass == &AMDGPU::VGPR_32RegClass) { 605 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 606 .addImm(Value); 607 return; 608 } 609 if (RegClass == &AMDGPU::VReg_64RegClass) { 610 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 611 .addImm(Value); 612 return; 613 } 614 615 unsigned EltSize = 4; 616 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 617 if (RI.isSGPRClass(RegClass)) { 618 if (RI.getRegSizeInBits(*RegClass) > 32) { 619 Opcode = 
AMDGPU::S_MOV_B64; 620 EltSize = 8; 621 } else { 622 Opcode = AMDGPU::S_MOV_B32; 623 EltSize = 4; 624 } 625 } 626 627 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 628 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 629 int64_t IdxValue = Idx == 0 ? Value : 0; 630 631 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 632 get(Opcode), RI.getSubReg(DestReg, Idx)); 633 Builder.addImm(IdxValue); 634 } 635 } 636 637 const TargetRegisterClass * 638 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 639 return &AMDGPU::VGPR_32RegClass; 640 } 641 642 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 643 MachineBasicBlock::iterator I, 644 const DebugLoc &DL, unsigned DstReg, 645 ArrayRef<MachineOperand> Cond, 646 unsigned TrueReg, 647 unsigned FalseReg) const { 648 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 649 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 650 "Not a VGPR32 reg"); 651 652 if (Cond.size() == 1) { 653 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 654 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 655 .add(Cond[0]); 656 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 657 .addReg(FalseReg) 658 .addReg(TrueReg) 659 .addReg(SReg); 660 } else if (Cond.size() == 2) { 661 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 662 switch (Cond[0].getImm()) { 663 case SIInstrInfo::SCC_TRUE: { 664 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 665 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 666 .addImm(-1) 667 .addImm(0); 668 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 669 .addReg(FalseReg) 670 .addReg(TrueReg) 671 .addReg(SReg); 672 break; 673 } 674 case SIInstrInfo::SCC_FALSE: { 675 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 676 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 677 .addImm(0) 678 .addImm(-1); 679 BuildMI(MBB, I, DL, 
get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 680 .addReg(FalseReg) 681 .addReg(TrueReg) 682 .addReg(SReg); 683 break; 684 } 685 case SIInstrInfo::VCCNZ: { 686 MachineOperand RegOp = Cond[1]; 687 RegOp.setImplicit(false); 688 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 689 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 690 .add(RegOp); 691 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 692 .addReg(FalseReg) 693 .addReg(TrueReg) 694 .addReg(SReg); 695 break; 696 } 697 case SIInstrInfo::VCCZ: { 698 MachineOperand RegOp = Cond[1]; 699 RegOp.setImplicit(false); 700 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 701 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 702 .add(RegOp); 703 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 704 .addReg(TrueReg) 705 .addReg(FalseReg) 706 .addReg(SReg); 707 break; 708 } 709 case SIInstrInfo::EXECNZ: { 710 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 711 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 712 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 713 .addImm(0); 714 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 715 .addImm(-1) 716 .addImm(0); 717 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 718 .addReg(FalseReg) 719 .addReg(TrueReg) 720 .addReg(SReg); 721 break; 722 } 723 case SIInstrInfo::EXECZ: { 724 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 725 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 726 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 727 .addImm(0); 728 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 729 .addImm(0) 730 .addImm(-1); 731 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 732 .addReg(FalseReg) 733 .addReg(TrueReg) 734 .addReg(SReg); 735 llvm_unreachable("Unhandled branch predicate EXECZ"); 736 break; 737 } 738 default: 739 llvm_unreachable("invalid branch 
predicate"); 740 } 741 } else { 742 llvm_unreachable("Can only handle Cond size 1 or 2"); 743 } 744 } 745 746 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 747 MachineBasicBlock::iterator I, 748 const DebugLoc &DL, 749 unsigned SrcReg, int Value) const { 750 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 751 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 752 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 753 .addImm(Value) 754 .addReg(SrcReg); 755 756 return Reg; 757 } 758 759 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, 760 MachineBasicBlock::iterator I, 761 const DebugLoc &DL, 762 unsigned SrcReg, int Value) const { 763 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 764 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 765 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 766 .addImm(Value) 767 .addReg(SrcReg); 768 769 return Reg; 770 } 771 772 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 773 774 if (RI.getRegSizeInBits(*DstRC) == 32) { 775 return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 776 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 777 return AMDGPU::S_MOV_B64; 778 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 779 return AMDGPU::V_MOV_B64_PSEUDO; 780 } 781 return AMDGPU::COPY; 782 } 783 784 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 785 switch (Size) { 786 case 4: 787 return AMDGPU::SI_SPILL_S32_SAVE; 788 case 8: 789 return AMDGPU::SI_SPILL_S64_SAVE; 790 case 16: 791 return AMDGPU::SI_SPILL_S128_SAVE; 792 case 32: 793 return AMDGPU::SI_SPILL_S256_SAVE; 794 case 64: 795 return AMDGPU::SI_SPILL_S512_SAVE; 796 default: 797 llvm_unreachable("unknown register size"); 798 } 799 } 800 801 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 802 switch (Size) { 803 case 4: 804 return AMDGPU::SI_SPILL_V32_SAVE; 805 case 8: 806 return AMDGPU::SI_SPILL_V64_SAVE; 807 case 12: 808 return AMDGPU::SI_SPILL_V96_SAVE; 809 case 16: 810 return AMDGPU::SI_SPILL_V128_SAVE; 811 case 32: 812 return AMDGPU::SI_SPILL_V256_SAVE; 813 case 64: 814 return AMDGPU::SI_SPILL_V512_SAVE; 815 default: 816 llvm_unreachable("unknown register size"); 817 } 818 } 819 820 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 821 MachineBasicBlock::iterator MI, 822 unsigned SrcReg, bool isKill, 823 int FrameIndex, 824 const TargetRegisterClass *RC, 825 const TargetRegisterInfo *TRI) const { 826 MachineFunction *MF = MBB.getParent(); 827 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 828 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 829 DebugLoc DL = MBB.findDebugLoc(MI); 830 831 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 832 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 833 MachinePointerInfo PtrInfo 834 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 835 MachineMemOperand *MMO 836 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 837 Size, Align); 838 unsigned SpillSize = TRI->getSpillSize(*RC); 
839 840 if (RI.isSGPRClass(RC)) { 841 MFI->setHasSpilledSGPRs(); 842 843 // We are only allowed to create one new instruction when spilling 844 // registers, so we need to use pseudo instruction for spilling SGPRs. 845 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); 846 847 // The SGPR spill/restore instructions only work on number sgprs, so we need 848 // to make sure we are using the correct register class. 849 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { 850 MachineRegisterInfo &MRI = MF->getRegInfo(); 851 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 852 } 853 854 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) 855 .addReg(SrcReg, getKillRegState(isKill)) // data 856 .addFrameIndex(FrameIndex) // addr 857 .addMemOperand(MMO) 858 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 859 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 860 // Add the scratch resource registers as implicit uses because we may end up 861 // needing them, and need to ensure that the reserved registers are 862 // correctly handled. 863 864 FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); 865 if (ST.hasScalarStores()) { 866 // m0 is used for offset to scalar stores if used to spill. 
867 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); 868 } 869 870 return; 871 } 872 873 if (!ST.isVGPRSpillingEnabled(MF->getFunction())) { 874 LLVMContext &Ctx = MF->getFunction().getContext(); 875 Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" 876 " spill register"); 877 BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) 878 .addReg(SrcReg); 879 880 return; 881 } 882 883 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 884 885 unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); 886 MFI->setHasSpilledVGPRs(); 887 BuildMI(MBB, MI, DL, get(Opcode)) 888 .addReg(SrcReg, getKillRegState(isKill)) // data 889 .addFrameIndex(FrameIndex) // addr 890 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 891 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 892 .addImm(0) // offset 893 .addMemOperand(MMO); 894 } 895 896 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 897 switch (Size) { 898 case 4: 899 return AMDGPU::SI_SPILL_S32_RESTORE; 900 case 8: 901 return AMDGPU::SI_SPILL_S64_RESTORE; 902 case 16: 903 return AMDGPU::SI_SPILL_S128_RESTORE; 904 case 32: 905 return AMDGPU::SI_SPILL_S256_RESTORE; 906 case 64: 907 return AMDGPU::SI_SPILL_S512_RESTORE; 908 default: 909 llvm_unreachable("unknown register size"); 910 } 911 } 912 913 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 914 switch (Size) { 915 case 4: 916 return AMDGPU::SI_SPILL_V32_RESTORE; 917 case 8: 918 return AMDGPU::SI_SPILL_V64_RESTORE; 919 case 12: 920 return AMDGPU::SI_SPILL_V96_RESTORE; 921 case 16: 922 return AMDGPU::SI_SPILL_V128_RESTORE; 923 case 32: 924 return AMDGPU::SI_SPILL_V256_RESTORE; 925 case 64: 926 return AMDGPU::SI_SPILL_V512_RESTORE; 927 default: 928 llvm_unreachable("unknown register size"); 929 } 930 } 931 932 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 933 MachineBasicBlock::iterator MI, 934 unsigned DestReg, int FrameIndex, 935 const TargetRegisterClass *RC, 936 const TargetRegisterInfo *TRI) const { 937 
MachineFunction *MF = MBB.getParent(); 938 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 939 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 940 DebugLoc DL = MBB.findDebugLoc(MI); 941 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 942 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 943 unsigned SpillSize = TRI->getSpillSize(*RC); 944 945 MachinePointerInfo PtrInfo 946 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 947 948 MachineMemOperand *MMO = MF->getMachineMemOperand( 949 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 950 951 if (RI.isSGPRClass(RC)) { 952 // FIXME: Maybe this should not include a memoperand because it will be 953 // lowered to non-memory instructions. 954 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 955 if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { 956 MachineRegisterInfo &MRI = MF->getRegInfo(); 957 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 958 } 959 960 FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); 961 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 962 .addFrameIndex(FrameIndex) // addr 963 .addMemOperand(MMO) 964 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 965 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 966 967 if (ST.hasScalarStores()) { 968 // m0 is used for offset to scalar stores if used to spill. 
// (continuation of loadRegFromStackSlot's SGPR path)
      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
    }

    return;
  }

  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
    // Cannot lower this restore: report an error and give DestReg a
    // placeholder definition so downstream passes see it defined.
    LLVMContext &Ctx = MF->getFunction().getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)        // vaddr
    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
    .addReg(MFI->getFrameOffsetReg()) // scratch_offset
    .addImm(0)                        // offset
    .addMemOperand(MMO);
}

/// Compute (into \p TmpReg) the per-lane LDS address for the spill slot at
/// \p FrameOffset, lazily materializing a scaled thread id in the entry block
/// the first time it is needed (cached via MFI TIDReg).
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled.
/// \returns TmpReg on success, or AMDGPU::NoRegister if no free VGPR could be
/// found for the thread id.
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    // The thread id is computed once, in the function entry block.
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    // Deliberately shadows the outer DL: entry-block instructions use the
    // location of the first entry instruction.
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      // Multiple waves per workgroup: derive a workgroup-unique id from the
      // preloaded workgroup ids and the NGROUPS kernel inputs.
      unsigned TIDIGXReg
        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      getAddNoCarry(Entry, Insert, DL, TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the thread id by 4 (dword stride).
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);

    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  getAddNoCarry(MBB, MI, DL, TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg);

  return TmpReg;
}

/// Insert enough S_NOPs before \p MI to cover \p Count wait states. Each
/// S_NOP immediate N yields N+1 wait states, so ceil(Count/8) instructions
/// are emitted.
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

/// Append the appropriate return (S_ENDPGM or SI_RETURN_TO_EPILOG) to an
/// entry function's exit block if it has no terminator yet.
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator)
      BuildMI(MBB, MBB.end(), DebugLoc(),
              get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
// (continuation of expandPostRAPseudo)
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Expand a 64-bit VGPR move into two 32-bit moves of the sub0/sub1 halves.
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Write the source only into the lanes that are inactive under the
    // current exec mask, by temporarily inverting exec around the move.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // Same as the B32 case, but the move is a 64-bit pseudo that is expanded
    // recursively.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOVRELD_B32_V1:
  case AMDGPU::V_MOVRELD_B32_V2:
  case AMDGPU::V_MOVRELD_B32_V4:
  case AMDGPU::V_MOVRELD_B32_V8:
  case AMDGPU::V_MOVRELD_B32_V16: {
    // Indexed write into a vector register: lower to V_MOVRELD_B32_e32 on the
    // statically-known base subregister; M0 supplies the dynamic index.
    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
    unsigned VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstr *MovRel =
      BuildMI(MBB, MI, DL, MovRelDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg,
                RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // Tie the implicit whole-vector def to the implicit whole-vector use so
    // the rest of the vector is preserved.
    const int ImpDefIdx =
      MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI.getOperand(0).getReg();
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                   .addReg(RegLo)
                   .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                              .addReg(RegHi);
    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
      MIB.addImm(0);
    else
      MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
    // is exited.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
  }
  return true;
}

/// Swap the src0/src1 modifier immediates of \p MI, if it has them.
/// \returns false (and changes nothing) when the instruction has no src0
/// modifiers operand.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

/// Exchange a register operand with an immediate/frame-index operand in
/// place, preserving the register flags. \returns &MI on success, nullptr if
/// the non-register operand kind is unsupported.
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  unsigned Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if
(NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

/// Commute src0/src1 of \p MI, switching to the commuted opcode and swapping
/// the source modifiers along with the operands. Handles reg/reg via the base
/// implementation and reg/imm (or reg/frame-index) by swapping in place.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!MI.isCommutable())
    return false;

  unsigned Opc = MI.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

/// \returns true if a branch over \p BrOffset bytes fits in the (debug-
/// restrictable) signed SIMM16 dword offset of the s_cbranch/s_branch forms.
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

/// Materialize an unconditional long branch to \p DestBB by computing the
/// destination PC into a scavenged SGPR pair and using s_setpc_b64.
/// \returns the size in bytes of the inserted sequence.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
// (continuation of insertIndirectBranch)
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
                                       MachineBasicBlock::iterator(GetPC), 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Byte size of the sequence: s_getpc + s_add_u32/s_sub_u32 (with literal)
  // + s_addc_u32/s_subb_u32 + s_setpc.
  return 4 + 8 + 4 + 4;
}

/// Map a BranchPredicate to the corresponding conditional branch opcode.
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

/// Inverse of getBranchOpcode; returns INVALID_BR for non-branch opcodes.
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

/// Analyze the terminators starting at \p I. On success fills TBB/FBB/Cond
/// with the usual analyzeBranch semantics and returns false; returns true
/// when the terminator sequence cannot be understood.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  if (I == MBB.end())
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
1620 if (TBB != MaskBrDest || Cond.empty()) 1621 return true; 1622 1623 auto Pred = Cond[0].getImm(); 1624 return (Pred != EXECZ && Pred != EXECNZ); 1625 } 1626 1627 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 1628 int *BytesRemoved) const { 1629 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 1630 1631 unsigned Count = 0; 1632 unsigned RemovedSize = 0; 1633 while (I != MBB.end()) { 1634 MachineBasicBlock::iterator Next = std::next(I); 1635 if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { 1636 I = Next; 1637 continue; 1638 } 1639 1640 RemovedSize += getInstSizeInBytes(*I); 1641 I->eraseFromParent(); 1642 ++Count; 1643 I = Next; 1644 } 1645 1646 if (BytesRemoved) 1647 *BytesRemoved = RemovedSize; 1648 1649 return Count; 1650 } 1651 1652 // Copy the flags onto the implicit condition register operand. 1653 static void preserveCondRegFlags(MachineOperand &CondReg, 1654 const MachineOperand &OrigCond) { 1655 CondReg.setIsUndef(OrigCond.isUndef()); 1656 CondReg.setIsKill(OrigCond.isKill()); 1657 } 1658 1659 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 1660 MachineBasicBlock *TBB, 1661 MachineBasicBlock *FBB, 1662 ArrayRef<MachineOperand> Cond, 1663 const DebugLoc &DL, 1664 int *BytesAdded) const { 1665 if (!FBB && Cond.empty()) { 1666 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1667 .addMBB(TBB); 1668 if (BytesAdded) 1669 *BytesAdded = 4; 1670 return 1; 1671 } 1672 1673 if(Cond.size() == 1 && Cond[0].isReg()) { 1674 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 1675 .add(Cond[0]) 1676 .addMBB(TBB); 1677 return 1; 1678 } 1679 1680 assert(TBB && Cond[0].isImm()); 1681 1682 unsigned Opcode 1683 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 1684 1685 if (!FBB) { 1686 Cond[1].isUndef(); 1687 MachineInstr *CondBr = 1688 BuildMI(&MBB, DL, get(Opcode)) 1689 .addMBB(TBB); 1690 1691 // Copy the flags onto the implicit condition register operand. 
1692 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1693 1694 if (BytesAdded) 1695 *BytesAdded = 4; 1696 return 1; 1697 } 1698 1699 assert(TBB && FBB); 1700 1701 MachineInstr *CondBr = 1702 BuildMI(&MBB, DL, get(Opcode)) 1703 .addMBB(TBB); 1704 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1705 .addMBB(FBB); 1706 1707 MachineOperand &CondReg = CondBr->getOperand(1); 1708 CondReg.setIsUndef(Cond[1].isUndef()); 1709 CondReg.setIsKill(Cond[1].isKill()); 1710 1711 if (BytesAdded) 1712 *BytesAdded = 8; 1713 1714 return 2; 1715 } 1716 1717 bool SIInstrInfo::reverseBranchCondition( 1718 SmallVectorImpl<MachineOperand> &Cond) const { 1719 if (Cond.size() != 2) { 1720 return true; 1721 } 1722 1723 if (Cond[0].isImm()) { 1724 Cond[0].setImm(-Cond[0].getImm()); 1725 return false; 1726 } 1727 1728 return true; 1729 } 1730 1731 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1732 ArrayRef<MachineOperand> Cond, 1733 unsigned TrueReg, unsigned FalseReg, 1734 int &CondCycles, 1735 int &TrueCycles, int &FalseCycles) const { 1736 switch (Cond[0].getImm()) { 1737 case VCCNZ: 1738 case VCCZ: { 1739 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1740 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1741 assert(MRI.getRegClass(FalseReg) == RC); 1742 1743 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1744 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1745 1746 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1747 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1748 } 1749 case SCC_TRUE: 1750 case SCC_FALSE: { 1751 // FIXME: We could insert for VGPRs if we could replace the original compare 1752 // with a vector one. 
// (continuation of canInsertSelect's SCC_TRUE/SCC_FALSE case)
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    assert(MRI.getRegClass(FalseReg) == RC);

    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;

    // Multiples of 8 can do s_cselect_b64
    if (NumInsts % 2 == 0)
      NumInsts /= 2;

    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    return RI.isSGPRClass(RC);
  }
  default:
    return false;
  }
}

/// Emit a select of \p TrueReg/\p FalseReg into \p DstReg under the branch
/// predicate in Cond[0]. Wide registers are split into 32-bit (VALU) or
/// 64-bit (SALU s_cselect_b64) pieces combined with a REG_SEQUENCE.
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, const DebugLoc &DL,
                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
                               unsigned TrueReg, unsigned FalseReg) const {
  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
  if (Pred == VCCZ || Pred == SCC_FALSE) {
    // Normalize to the "true" predicate by swapping the operands; the
    // negated predicate value is the positive one (see BranchPredicate).
    Pred = static_cast<BranchPredicate>(-Pred);
    std::swap(TrueReg, FalseReg);
  }

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
  unsigned DstSize = RI.getRegSizeInBits(*DstRC);

  if (DstSize == 32) {
    unsigned SelOp = Pred == SCC_TRUE ?
      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;

    // Instruction's operands are backwards from what is expected.
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 64 && Pred == SCC_TRUE) {
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 32;

  // 64-bit select is only available for SALU.
  if (Pred == SCC_TRUE) {
    SelOp = AMDGPU::S_CSELECT_B64;
    EltRC = &AMDGPU::SGPR_64RegClass;
    SubIndices = Sub0_15_64;

    assert(NElts % 2 == 0);
    NElts /= 2;
  }

  MachineInstrBuilder MIB = BuildMI(
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  // Insert the per-element selects before the REG_SEQUENCE.
  I = MIB->getIterator();

  SmallVector<unsigned, 8> Regs;
  for (int Idx = 0; Idx != NElts; ++Idx) {
    unsigned DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(FalseReg, 0, SubIdx)
        .addReg(TrueReg, 0, SubIdx);
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}

/// \returns true if \p MI is a move whose source can be folded into its
/// users (plain moves and COPYs; V_MOVs only when they carry no extra
/// implicit operands that would indicate register indexing).
bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO: {
    // If there are additional implicit register operands, this may be used for
    // register indexing so the source register operand isn't simply copied.
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
    return true;
  default:
    return false;
  }
}

/// Classify a pseudo-source-value kind into the address space its accesses
/// use; anything unlisted falls through to the flat address space.
unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
    PseudoSourceValue::PSVKind Kind) const {
  switch(Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUASI.PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUASI.CONSTANT_ADDRESS;
  }
  return AMDGPUASI.FLAT_ADDRESS;
}

/// Strip the src0/src1/src2 modifier operands from \p MI (highest index
/// first so the remaining indices stay valid).
static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

/// Try to fold the immediate defined by \p DefMI into its single use
/// \p UseMI: COPYs become immediate moves, and MAD/MAC users can become
/// MADMK/MADAK forms.
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  switch (DefMI.getOpcode()) {
  default:
    return false;
  case AMDGPU::S_MOV_B64:
    // TODO: We could fold 64-bit immediates, but this gets complicated
    // when there are sub-registers.
// (continuation of FoldImmediate)
    return false;

  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
    break;
  }

  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
  assert(ImmOp);
  // FIXME: We could handle FrameIndex values here.
  if (!ImmOp->isImm())
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    // Rewrite the COPY into a direct immediate move of the right bank.
    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    UseMI.setDesc(get(NewOpc));
    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    return true;
  }

  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
    // Don't fold if we are using source or output modifiers. The new VOP2
    // instructions don't have them.
    if (hasAnyModifiersSet(UseMI))
      return false;

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);

    // Any src operand can be used for the legality check.
    if (isInlineConstant(UseMI, *Src0, *ImmOp))
      return false;

    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      // The remaining sources must be VGPRs: madmk's constant already uses
      // the constant bus.
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = ImmOp->getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = ImmOp->getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64)
        UseMI.untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

/// \returns true if the byte ranges [OffsetA, OffsetA+WidthA) and
/// [OffsetB, OffsetB+WidthB) do not overlap.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
                                               MachineInstr &MIb) const {
  unsigned BaseReg0, BaseReg1;
  int64_t Offset0, Offset1;

  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {

    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
      // FIXME: Handle ds_read2 / ds_write2.
2075 return false; 2076 } 2077 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 2078 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 2079 if (BaseReg0 == BaseReg1 && 2080 offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 2081 return true; 2082 } 2083 } 2084 2085 return false; 2086 } 2087 2088 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 2089 MachineInstr &MIb, 2090 AliasAnalysis *AA) const { 2091 assert((MIa.mayLoad() || MIa.mayStore()) && 2092 "MIa must load from or modify a memory location"); 2093 assert((MIb.mayLoad() || MIb.mayStore()) && 2094 "MIb must load from or modify a memory location"); 2095 2096 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 2097 return false; 2098 2099 // XXX - Can we relax this between address spaces? 2100 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 2101 return false; 2102 2103 if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { 2104 const MachineMemOperand *MMOa = *MIa.memoperands_begin(); 2105 const MachineMemOperand *MMOb = *MIb.memoperands_begin(); 2106 if (MMOa->getValue() && MMOb->getValue()) { 2107 MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); 2108 MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); 2109 if (!AA->alias(LocA, LocB)) 2110 return true; 2111 } 2112 } 2113 2114 // TODO: Should we check the address space from the MachineMemOperand? That 2115 // would allow us to distinguish objects we know don't alias based on the 2116 // underlying address space, even if it was lowered to a different one, 2117 // e.g. private accesses lowered to use MUBUF instructions on a scratch 2118 // buffer. 
2119 if (isDS(MIa)) { 2120 if (isDS(MIb)) 2121 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2122 2123 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2124 } 2125 2126 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2127 if (isMUBUF(MIb) || isMTBUF(MIb)) 2128 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2129 2130 return !isFLAT(MIb) && !isSMRD(MIb); 2131 } 2132 2133 if (isSMRD(MIa)) { 2134 if (isSMRD(MIb)) 2135 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2136 2137 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 2138 } 2139 2140 if (isFLAT(MIa)) { 2141 if (isFLAT(MIb)) 2142 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2143 2144 return false; 2145 } 2146 2147 return false; 2148 } 2149 2150 static int64_t getFoldableImm(const MachineOperand* MO) { 2151 if (!MO->isReg()) 2152 return false; 2153 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2154 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2155 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2156 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2157 Def->getOperand(1).isImm()) 2158 return Def->getOperand(1).getImm(); 2159 return AMDGPU::NoRegister; 2160 } 2161 2162 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2163 MachineInstr &MI, 2164 LiveVariables *LV) const { 2165 unsigned Opc = MI.getOpcode(); 2166 bool IsF16 = false; 2167 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; 2168 2169 switch (Opc) { 2170 default: 2171 return nullptr; 2172 case AMDGPU::V_MAC_F16_e64: 2173 IsF16 = true; 2174 LLVM_FALLTHROUGH; 2175 case AMDGPU::V_MAC_F32_e64: 2176 case AMDGPU::V_FMAC_F32_e64: 2177 break; 2178 case AMDGPU::V_MAC_F16_e32: 2179 IsF16 = true; 2180 LLVM_FALLTHROUGH; 2181 case AMDGPU::V_MAC_F32_e32: 2182 case AMDGPU::V_FMAC_F32_e32: { 2183 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2184 AMDGPU::OpName::src0); 2185 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2186 if (!Src0->isReg() && !Src0->isImm()) 
2187 return nullptr; 2188 2189 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2190 return nullptr; 2191 2192 break; 2193 } 2194 } 2195 2196 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2197 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2198 const MachineOperand *Src0Mods = 2199 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2200 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2201 const MachineOperand *Src1Mods = 2202 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2203 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2204 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2205 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2206 2207 if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && 2208 // If we have an SGPR input, we will violate the constant bus restriction. 2209 (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 2210 if (auto Imm = getFoldableImm(Src2)) { 2211 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2212 get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) 2213 .add(*Dst) 2214 .add(*Src0) 2215 .add(*Src1) 2216 .addImm(Imm); 2217 } 2218 if (auto Imm = getFoldableImm(Src1)) { 2219 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2220 get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) 2221 .add(*Dst) 2222 .add(*Src0) 2223 .addImm(Imm) 2224 .add(*Src2); 2225 } 2226 if (auto Imm = getFoldableImm(Src0)) { 2227 if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32, 2228 AMDGPU::OpName::src0), Src1)) 2229 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2230 get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) 2231 .add(*Dst) 2232 .add(*Src1) 2233 .addImm(Imm) 2234 .add(*Src2); 2235 } 2236 } 2237 2238 assert((!IsFMA || !IsF16) && "fmac only expected with f32"); 2239 unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : 2240 (IsF16 ? 
AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
  // Build the full VOP3 form, carrying over any operand modifiers that were
  // present on the two-address instruction.
  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
      .add(*Dst)
      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
      .add(*Src0)
      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
      .add(*Src1)
      .addImm(0) // src2_modifiers (none on the source MAC/FMAC form)
      .add(*Src2)
      .addImm(Clamp ? Clamp->getImm() : 0)
      .addImm(Omod ? Omod->getImm() : 0);
}

// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                       const MachineBasicBlock *MBB,
                                       const MachineFunction &MF) const {
  // XXX - Do we want the SP check in the base implementation?

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
2275 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 2276 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 2277 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 2278 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 2279 changesVGPRIndexingMode(MI); 2280 } 2281 2282 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 2283 switch (Imm.getBitWidth()) { 2284 case 32: 2285 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 2286 ST.hasInv2PiInlineImm()); 2287 case 64: 2288 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 2289 ST.hasInv2PiInlineImm()); 2290 case 16: 2291 return ST.has16BitInsts() && 2292 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 2293 ST.hasInv2PiInlineImm()); 2294 default: 2295 llvm_unreachable("invalid bitwidth"); 2296 } 2297 } 2298 2299 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 2300 uint8_t OperandType) const { 2301 if (!MO.isImm() || 2302 OperandType < AMDGPU::OPERAND_SRC_FIRST || 2303 OperandType > AMDGPU::OPERAND_SRC_LAST) 2304 return false; 2305 2306 // MachineOperand provides no way to tell the true operand size, since it only 2307 // records a 64-bit value. We need to know the size to determine if a 32-bit 2308 // floating point immediate bit pattern is legal for an integer immediate. It 2309 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
2310 2311 int64_t Imm = MO.getImm(); 2312 switch (OperandType) { 2313 case AMDGPU::OPERAND_REG_IMM_INT32: 2314 case AMDGPU::OPERAND_REG_IMM_FP32: 2315 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2316 case AMDGPU::OPERAND_REG_INLINE_C_FP32: { 2317 int32_t Trunc = static_cast<int32_t>(Imm); 2318 return Trunc == Imm && 2319 AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 2320 } 2321 case AMDGPU::OPERAND_REG_IMM_INT64: 2322 case AMDGPU::OPERAND_REG_IMM_FP64: 2323 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2324 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2325 return AMDGPU::isInlinableLiteral64(MO.getImm(), 2326 ST.hasInv2PiInlineImm()); 2327 case AMDGPU::OPERAND_REG_IMM_INT16: 2328 case AMDGPU::OPERAND_REG_IMM_FP16: 2329 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2330 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2331 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 2332 // A few special case instructions have 16-bit operands on subtargets 2333 // where 16-bit instructions are not legal. 2334 // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle 2335 // constants in these cases 2336 int16_t Trunc = static_cast<int16_t>(Imm); 2337 return ST.has16BitInsts() && 2338 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 2339 } 2340 2341 return false; 2342 } 2343 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 2344 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { 2345 if (isUInt<16>(Imm)) { 2346 int16_t Trunc = static_cast<int16_t>(Imm); 2347 return ST.has16BitInsts() && 2348 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 2349 } 2350 if (!(Imm & 0xffff)) { 2351 return ST.has16BitInsts() && 2352 AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm()); 2353 } 2354 uint32_t Trunc = static_cast<uint32_t>(Imm); 2355 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 2356 } 2357 default: 2358 llvm_unreachable("invalid bitwidth"); 2359 } 2360 } 2361 2362 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 2363 const MCOperandInfo &OpInfo) const { 2364 switch (MO.getType()) { 2365 case MachineOperand::MO_Register: 2366 return false; 2367 case MachineOperand::MO_Immediate: 2368 return !isInlineConstant(MO, OpInfo); 2369 case MachineOperand::MO_FrameIndex: 2370 case MachineOperand::MO_MachineBasicBlock: 2371 case MachineOperand::MO_ExternalSymbol: 2372 case MachineOperand::MO_GlobalAddress: 2373 case MachineOperand::MO_MCSymbol: 2374 return true; 2375 default: 2376 llvm_unreachable("unexpected operand type"); 2377 } 2378 } 2379 2380 static bool compareMachineOp(const MachineOperand &Op0, 2381 const MachineOperand &Op1) { 2382 if (Op0.getType() != Op1.getType()) 2383 return false; 2384 2385 switch (Op0.getType()) { 2386 case MachineOperand::MO_Register: 2387 return Op0.getReg() == Op1.getReg(); 2388 case MachineOperand::MO_Immediate: 2389 return Op0.getImm() == Op1.getImm(); 2390 default: 2391 llvm_unreachable("Didn't expect to be comparing these operand types"); 2392 } 2393 } 2394 2395 bool SIInstrInfo::isImmOperandLegal(const 
MachineInstr &MI, unsigned OpNo, 2396 const MachineOperand &MO) const { 2397 const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; 2398 2399 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2400 2401 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 2402 return true; 2403 2404 if (OpInfo.RegClass < 0) 2405 return false; 2406 2407 if (MO.isImm() && isInlineConstant(MO, OpInfo)) 2408 return RI.opCanUseInlineConstant(OpInfo.OperandType); 2409 2410 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 2411 } 2412 2413 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 2414 int Op32 = AMDGPU::getVOPe32(Opcode); 2415 if (Op32 == -1) 2416 return false; 2417 2418 return pseudoToMCOpcode(Op32) != -1; 2419 } 2420 2421 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 2422 // The src0_modifier operand is present on all instructions 2423 // that have modifiers. 2424 2425 return AMDGPU::getNamedOperandIdx(Opcode, 2426 AMDGPU::OpName::src0_modifiers) != -1; 2427 } 2428 2429 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 2430 unsigned OpName) const { 2431 const MachineOperand *Mods = getNamedOperand(MI, OpName); 2432 return Mods && Mods->getImm(); 2433 } 2434 2435 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 2436 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 2437 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 2438 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 2439 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 2440 hasModifiersSet(MI, AMDGPU::OpName::omod); 2441 } 2442 2443 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 2444 const MachineOperand &MO, 2445 const MCOperandInfo &OpInfo) const { 2446 // Literal constants use the constant bus. 
2447 //if (isLiteralConstantLike(MO, OpInfo)) 2448 // return true; 2449 if (MO.isImm()) 2450 return !isInlineConstant(MO, OpInfo); 2451 2452 if (!MO.isReg()) 2453 return true; // Misc other operands like FrameIndex 2454 2455 if (!MO.isUse()) 2456 return false; 2457 2458 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 2459 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 2460 2461 // FLAT_SCR is just an SGPR pair. 2462 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 2463 return true; 2464 2465 // EXEC register uses the constant bus. 2466 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 2467 return true; 2468 2469 // SGPRs use the constant bus 2470 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 2471 (!MO.isImplicit() && 2472 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 2473 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 2474 } 2475 2476 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 2477 for (const MachineOperand &MO : MI.implicit_operands()) { 2478 // We only care about reads. 
2479 if (MO.isDef()) 2480 continue; 2481 2482 switch (MO.getReg()) { 2483 case AMDGPU::VCC: 2484 case AMDGPU::M0: 2485 case AMDGPU::FLAT_SCR: 2486 return MO.getReg(); 2487 2488 default: 2489 break; 2490 } 2491 } 2492 2493 return AMDGPU::NoRegister; 2494 } 2495 2496 static bool shouldReadExec(const MachineInstr &MI) { 2497 if (SIInstrInfo::isVALU(MI)) { 2498 switch (MI.getOpcode()) { 2499 case AMDGPU::V_READLANE_B32: 2500 case AMDGPU::V_READLANE_B32_si: 2501 case AMDGPU::V_READLANE_B32_vi: 2502 case AMDGPU::V_WRITELANE_B32: 2503 case AMDGPU::V_WRITELANE_B32_si: 2504 case AMDGPU::V_WRITELANE_B32_vi: 2505 return false; 2506 } 2507 2508 return true; 2509 } 2510 2511 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 2512 SIInstrInfo::isSALU(MI) || 2513 SIInstrInfo::isSMRD(MI)) 2514 return false; 2515 2516 return true; 2517 } 2518 2519 static bool isSubRegOf(const SIRegisterInfo &TRI, 2520 const MachineOperand &SuperVec, 2521 const MachineOperand &SubReg) { 2522 if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) 2523 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 2524 2525 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 2526 SubReg.getReg() == SuperVec.getReg(); 2527 } 2528 2529 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 2530 StringRef &ErrInfo) const { 2531 uint16_t Opcode = MI.getOpcode(); 2532 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 2533 return true; 2534 2535 const MachineFunction *MF = MI.getParent()->getParent(); 2536 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2537 2538 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 2539 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 2540 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 2541 2542 // Make sure the number of operands is correct. 
2543 const MCInstrDesc &Desc = get(Opcode); 2544 if (!Desc.isVariadic() && 2545 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 2546 ErrInfo = "Instruction has wrong number of operands."; 2547 return false; 2548 } 2549 2550 if (MI.isInlineAsm()) { 2551 // Verify register classes for inlineasm constraints. 2552 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 2553 I != E; ++I) { 2554 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 2555 if (!RC) 2556 continue; 2557 2558 const MachineOperand &Op = MI.getOperand(I); 2559 if (!Op.isReg()) 2560 continue; 2561 2562 unsigned Reg = Op.getReg(); 2563 if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { 2564 ErrInfo = "inlineasm operand has incorrect register class."; 2565 return false; 2566 } 2567 } 2568 2569 return true; 2570 } 2571 2572 // Make sure the register classes are correct. 2573 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 2574 if (MI.getOperand(i).isFPImm()) { 2575 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 2576 "all fp values to integers."; 2577 return false; 2578 } 2579 2580 int RegClass = Desc.OpInfo[i].RegClass; 2581 2582 switch (Desc.OpInfo[i].OperandType) { 2583 case MCOI::OPERAND_REGISTER: 2584 if (MI.getOperand(i).isImm()) { 2585 ErrInfo = "Illegal immediate value for operand."; 2586 return false; 2587 } 2588 break; 2589 case AMDGPU::OPERAND_REG_IMM_INT32: 2590 case AMDGPU::OPERAND_REG_IMM_FP32: 2591 break; 2592 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2593 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 2594 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2595 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2596 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2597 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2598 const MachineOperand &MO = MI.getOperand(i); 2599 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 2600 ErrInfo = "Illegal immediate value for operand."; 2601 return false; 2602 } 2603 break; 2604 } 2605 case MCOI::OPERAND_IMMEDIATE: 2606 case AMDGPU::OPERAND_KIMM32: 2607 // Check if this operand is an immediate. 2608 // FrameIndex operands will be replaced by immediates, so they are 2609 // allowed. 
2610 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 2611 ErrInfo = "Expected immediate, but got non-immediate"; 2612 return false; 2613 } 2614 LLVM_FALLTHROUGH; 2615 default: 2616 continue; 2617 } 2618 2619 if (!MI.getOperand(i).isReg()) 2620 continue; 2621 2622 if (RegClass != -1) { 2623 unsigned Reg = MI.getOperand(i).getReg(); 2624 if (Reg == AMDGPU::NoRegister || 2625 TargetRegisterInfo::isVirtualRegister(Reg)) 2626 continue; 2627 2628 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 2629 if (!RC->contains(Reg)) { 2630 ErrInfo = "Operand has incorrect register class."; 2631 return false; 2632 } 2633 } 2634 } 2635 2636 // Verify SDWA 2637 if (isSDWA(MI)) { 2638 if (!ST.hasSDWA()) { 2639 ErrInfo = "SDWA is not supported on this target"; 2640 return false; 2641 } 2642 2643 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 2644 2645 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 2646 2647 for (int OpIdx: OpIndicies) { 2648 if (OpIdx == -1) 2649 continue; 2650 const MachineOperand &MO = MI.getOperand(OpIdx); 2651 2652 if (!ST.hasSDWAScalar()) { 2653 // Only VGPRS on VI 2654 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 2655 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 2656 return false; 2657 } 2658 } else { 2659 // No immediates on GFX9 2660 if (!MO.isReg()) { 2661 ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; 2662 return false; 2663 } 2664 } 2665 } 2666 2667 if (!ST.hasSDWAOmod()) { 2668 // No omod allowed on VI 2669 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 2670 if (OMod != nullptr && 2671 (!OMod->isImm() || OMod->getImm() != 0)) { 2672 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 2673 return false; 2674 } 2675 } 2676 2677 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 2678 if (isVOPC(BasicOpcode)) { 2679 if (!ST.hasSDWASdst() && DstIdx != -1) { 2680 // Only vcc allowed as 
dst on VI for VOPC 2681 const MachineOperand &Dst = MI.getOperand(DstIdx); 2682 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 2683 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 2684 return false; 2685 } 2686 } else if (!ST.hasSDWAOutModsVOPC()) { 2687 // No clamp allowed on GFX9 for VOPC 2688 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2689 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 2690 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 2691 return false; 2692 } 2693 2694 // No omod allowed on GFX9 for VOPC 2695 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 2696 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 2697 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 2698 return false; 2699 } 2700 } 2701 } 2702 2703 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 2704 if (DstUnused && DstUnused->isImm() && 2705 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 2706 const MachineOperand &Dst = MI.getOperand(DstIdx); 2707 if (!Dst.isReg() || !Dst.isTied()) { 2708 ErrInfo = "Dst register should have tied register"; 2709 return false; 2710 } 2711 2712 const MachineOperand &TiedMO = 2713 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 2714 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 2715 ErrInfo = 2716 "Dst register should be tied to implicit use of preserved register"; 2717 return false; 2718 } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) && 2719 Dst.getReg() != TiedMO.getReg()) { 2720 ErrInfo = "Dst register should use same physical register as preserved"; 2721 return false; 2722 } 2723 } 2724 } 2725 2726 // Verify VOP*. Ignore multiple sgpr operands on writelane. 2727 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 2728 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 2729 // Only look at the true operands. 
Only a real operand can use the constant 2730 // bus, and we don't want to check pseudo-operands like the source modifier 2731 // flags. 2732 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 2733 2734 unsigned ConstantBusCount = 0; 2735 unsigned LiteralCount = 0; 2736 2737 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 2738 ++ConstantBusCount; 2739 2740 unsigned SGPRUsed = findImplicitSGPRRead(MI); 2741 if (SGPRUsed != AMDGPU::NoRegister) 2742 ++ConstantBusCount; 2743 2744 for (int OpIdx : OpIndices) { 2745 if (OpIdx == -1) 2746 break; 2747 const MachineOperand &MO = MI.getOperand(OpIdx); 2748 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 2749 if (MO.isReg()) { 2750 if (MO.getReg() != SGPRUsed) 2751 ++ConstantBusCount; 2752 SGPRUsed = MO.getReg(); 2753 } else { 2754 ++ConstantBusCount; 2755 ++LiteralCount; 2756 } 2757 } 2758 } 2759 if (ConstantBusCount > 1) { 2760 ErrInfo = "VOP* instruction uses the constant bus more than once"; 2761 return false; 2762 } 2763 2764 if (isVOP3(MI) && LiteralCount) { 2765 ErrInfo = "VOP3 instruction uses literal"; 2766 return false; 2767 } 2768 } 2769 2770 // Verify misc. restrictions on specific instructions. 
2771 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 2772 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 2773 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2774 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 2775 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 2776 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 2777 if (!compareMachineOp(Src0, Src1) && 2778 !compareMachineOp(Src0, Src2)) { 2779 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 2780 return false; 2781 } 2782 } 2783 } 2784 2785 if (isSOPK(MI)) { 2786 int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 2787 if (sopkIsZext(MI)) { 2788 if (!isUInt<16>(Imm)) { 2789 ErrInfo = "invalid immediate for SOPK instruction"; 2790 return false; 2791 } 2792 } else { 2793 if (!isInt<16>(Imm)) { 2794 ErrInfo = "invalid immediate for SOPK instruction"; 2795 return false; 2796 } 2797 } 2798 } 2799 2800 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 2801 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 2802 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2803 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 2804 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 2805 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 2806 2807 const unsigned StaticNumOps = Desc.getNumOperands() + 2808 Desc.getNumImplicitUses(); 2809 const unsigned NumImplicitOps = IsDst ? 2 : 1; 2810 2811 // Allow additional implicit operands. This allows a fixup done by the post 2812 // RA scheduler where the main implicit operand is killed and implicit-defs 2813 // are added for sub-registers that remain live after this instruction. 
2814 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 2815 ErrInfo = "missing implicit register operands"; 2816 return false; 2817 } 2818 2819 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2820 if (IsDst) { 2821 if (!Dst->isUse()) { 2822 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 2823 return false; 2824 } 2825 2826 unsigned UseOpIdx; 2827 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 2828 UseOpIdx != StaticNumOps + 1) { 2829 ErrInfo = "movrel implicit operands should be tied"; 2830 return false; 2831 } 2832 } 2833 2834 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 2835 const MachineOperand &ImpUse 2836 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 2837 if (!ImpUse.isReg() || !ImpUse.isUse() || 2838 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 2839 ErrInfo = "src0 should be subreg of implicit vector use"; 2840 return false; 2841 } 2842 } 2843 2844 // Make sure we aren't losing exec uses in the td files. This mostly requires 2845 // being careful when using let Uses to try to add other use registers. 2846 if (shouldReadExec(MI)) { 2847 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 2848 ErrInfo = "VALU instruction does not implicitly read exec mask"; 2849 return false; 2850 } 2851 } 2852 2853 if (isSMRD(MI)) { 2854 if (MI.mayStore()) { 2855 // The register offset form of scalar stores may only use m0 as the 2856 // soffset register. 
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
      if (Soff && Soff->getReg() != AMDGPU::M0) {
        ErrInfo = "scalar stores must use m0 as offset register";
        return false;
      }
    }
  }

  // Subtargets without flat-instruction offset support require the immediate
  // offset operand of FLAT instructions to be zero.
  if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
    if (Offset->getImm() != 0) {
      ErrInfo = "subtarget does not support offsets in flat instructions";
      return false;
    }
  }

  // Reject dpp_ctrl immediates that fall in the encoding's unused ranges.
  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
  if (DppCt) {
    using namespace AMDGPU::DPP;

    unsigned DC = DppCt->getImm();
    if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
        DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
        (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
        (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
      ErrInfo = "Invalid dpp_ctrl value";
      return false;
    }
  }

  return true;
}

/// Return the VALU opcode that replaces the given scalar opcode when an
/// instruction is moved to the vector unit, or
/// AMDGPU::INSTRUCTION_LIST_END if there is no direct replacement.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::WWM: return AMDGPU::WWM;
  case AMDGPU::S_MOV_B32:
    // A register source becomes a plain COPY; an immediate source needs an
    // actual v_mov.
    return MI.getOperand(1).isReg() ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
}

/// Return the register class of operand \p OpNo: the class from the
/// instruction description when one is specified, otherwise the class of the
/// actual (virtual or physical) register in the operand.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    // No class in the descriptor; fall back to the register itself.
    unsigned Reg = MI.getOperand(OpNo).getReg();

    if (TargetRegisterInfo::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

/// Return true if operand \p OpNo of \p MI may hold a VGPR.  For the generic
/// copy-like pseudos the result is determined by the destination class.
bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::PHI:
  case AMDGPU::INSERT_SUBREG:
    return
        RI.hasVGPRs(getOpRegClass(MI, 0));
  default:
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
  }
}

/// Replace operand \p OpIdx of \p MI with a new virtual VGPR, defined just
/// before \p MI by a copy/move of the original operand value.  Used to make
/// an illegal operand legal by materializing it in a vector register.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  // Pick the producing instruction: COPY for register operands, otherwise a
  // scalar or vector move depending on the expected class.
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = AMDGPU::S_MOV_B32;

  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  unsigned Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  MO.ChangeToRegister(Reg, false);
}

/// Copy sub-register \p SubIdx of \p SuperReg into a fresh virtual register
/// of class \p SubRC, inserting the copies before \p MI, and return it.
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

/// Like buildExtractSubReg, but \p Op may also be a 64-bit immediate, in
/// which case the selected 32-bit half is returned as an immediate operand.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}

/// Return true if register operand \p MO satisfies the register-class
/// constraint described by \p OpInfo.
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  unsigned Reg = MO.getReg();
  const TargetRegisterClass *RC =
    TargetRegisterInfo::isVirtualRegister(Reg) ?
    MRI.getRegClass(Reg) :
    RI.getPhysRegClass(Reg);

  const SIRegisterInfo *TRI =
    static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  // Narrow to the class reachable through the operand's sub-register index.
  RC = TRI->getSubRegClass(RC, MO.getSubReg());

  // In order to be legal, the common sub-class must be equal to the
  // class of the current operand. For example:
  //
  // v_mov_b32 s0 ; Operand defined as vsrc_b32
  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL

  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
}

/// Check legality of a VSrc-style operand: registers are checked against the
/// class constraint, immediate-like operands are always accepted.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
  return true;
}

/// Return true if \p MO (or operand \p OpIdx of \p MI when \p MO is null)
/// would be a legal operand at position \p OpIdx, including the VALU
/// single-constant-bus-use restriction.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
    OpInfo.RegClass != -1 ?
    RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {

    // If the candidate operand consumes the constant bus, no other operand
    // (except another use of the same SGPR) may also consume it.
    RegSubRegPair SGPRUsed;
    if (MO->isReg())
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          return false;
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

/// Legalize the operands of a VOP2/VOPC instruction, moving src1 into a VGPR
/// or commuting the instruction as needed.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type for instructions because they will always
  // violate the one constant bus use rule.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI.getOperand(Src0Idx);

    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
      legalizeOpWithMove(MI, Src0Idx);
  }

  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
  // src0/src1 with V_READFIRSTLANE.
  if (Opc == AMDGPU::V_WRITELANE_B32) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    MachineOperand &Src0 = MI.getOperand(Src0Idx);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src0);
      Src0.ChangeToRegister(Reg, false);
    }
    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    return;
  }

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
      .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Manually swap the src0/src1 operand values (the saved values are needed
  // because ChangeTo* overwrites Src0 in place).
  unsigned Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
}

// Legalize VOP3 operands. Because all operand types are supported for any
// operand, and since literal constants are not allowed and should never be
// seen, we only need to worry about inserting copies if we use multiple SGPR
// operands.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  // Find the one SGPR operand we are allowed to use.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    // We should never see a VOP3 instruction with an illegal immediate operand.
    if (!MO.isReg())
      continue;

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
      SGPRReg = MO.getReg();
      // We can use one SGPR in each VOP3 instruction.
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

/// Copy the value of VGPR \p SrcReg into a new SGPR of the equivalent class
/// using V_READFIRSTLANE per 32-bit sub-register (plus a REG_SEQUENCE for
/// multi-dword registers), inserted before \p UseMI.  Returns the new SGPR.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  if (SubRegs == 1) {
    // Single dword: one readfirstlane is enough.
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
      .addReg(SrcReg);
    return DstReg;
  }

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
      .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  // Reassemble the per-dword results into the full-width SGPR.
  MachineInstrBuilder MIB =
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move it to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
}

/// Rewrite register operand \p Op to a fresh virtual register of class
/// \p DstRC, inserting a COPY at \p I in \p InsertMBB; no-op if the operand
/// is already of the requested class.
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  unsigned OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
    RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  unsigned DstReg = MRI.createVirtualRegister(DstRC);
  MachineInstr *Copy =
    BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

  Op.setReg(DstReg);
  Op.setSubReg(0);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate())
    FoldImmediate(*Copy, *Def, OpReg, &MRI);
}

/// Legalize all operands of \p MI so every operand satisfies its
/// register-class constraint, dispatching to the per-format helpers and
/// handling the generic pseudos (PHI, REG_SEQUENCE, INSERT_SUBREG, ...) and
/// MUBUF addr64 conversion inline.
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
        MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be VGPRs,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;

      // MI is a PHI instruction.
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      // Avoid creating no-op copies with the same src and dst reg class. These
      // confuse some of the machine passes.
      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI.getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock *MBB = MI.getParent();
      MachineOperand &Op = MI.getOperand(1);
      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    }
    return;
  }

  // Legalize SI_INIT_M0
  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
    MachineOperand &Src = MI.getOperand(0);
    if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return;
  }

  // Legalize MIMG and MUBUF/MTBUF for shaders.
  //
  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
  // scratch memory access. In both cases, the legalization never involves
  // conversion to the addr64 form.
  if (isMIMG(MI) ||
      (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
       (isMUBUF(MI) || isMTBUF(MI)))) {
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions by converting to addr64 form.
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them as above. This especially applies to the
  // buffer_load_format_* variants and variants with idxen (or bothen).
  int SRsrcIdx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI.getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
      .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI.getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
             < SISubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
          BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
            .add(*VData)
            .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                        // This will be replaced later
                                        // with the new value of vaddr.
            .add(*SRsrc)
            .add(*SOffset)
            .add(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
              getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
              getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
          .add(*VData)
          .add(*VDataIn)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                                      // This will be replaced later
                                      // with the new value of vaddr.
          .add(*SRsrc)
          .add(*SOffset)
          .add(*Offset)
          .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
          .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
      }

      MI.removeFromParent();

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
              NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}

void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  SetVectorType Worklist;
  Worklist.insert(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned
NewOpcode = getVALUOp(Inst); 3704 3705 // Handle some special cases 3706 switch (Opcode) { 3707 default: 3708 break; 3709 case AMDGPU::S_ADD_U64_PSEUDO: 3710 case AMDGPU::S_SUB_U64_PSEUDO: 3711 splitScalar64BitAddSub(Worklist, Inst); 3712 Inst.eraseFromParent(); 3713 continue; 3714 case AMDGPU::S_ADD_I32: 3715 case AMDGPU::S_SUB_I32: 3716 // FIXME: The u32 versions currently selected use the carry. 3717 if (moveScalarAddSub(Worklist, Inst)) 3718 continue; 3719 3720 // Default handling 3721 break; 3722 case AMDGPU::S_AND_B64: 3723 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); 3724 Inst.eraseFromParent(); 3725 continue; 3726 3727 case AMDGPU::S_OR_B64: 3728 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); 3729 Inst.eraseFromParent(); 3730 continue; 3731 3732 case AMDGPU::S_XOR_B64: 3733 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); 3734 Inst.eraseFromParent(); 3735 continue; 3736 3737 case AMDGPU::S_NOT_B64: 3738 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); 3739 Inst.eraseFromParent(); 3740 continue; 3741 3742 case AMDGPU::S_BCNT1_I32_B64: 3743 splitScalar64BitBCNT(Worklist, Inst); 3744 Inst.eraseFromParent(); 3745 continue; 3746 3747 case AMDGPU::S_BFE_I64: 3748 splitScalar64BitBFE(Worklist, Inst); 3749 Inst.eraseFromParent(); 3750 continue; 3751 3752 case AMDGPU::S_LSHL_B32: 3753 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3754 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 3755 swapOperands(Inst); 3756 } 3757 break; 3758 case AMDGPU::S_ASHR_I32: 3759 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3760 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 3761 swapOperands(Inst); 3762 } 3763 break; 3764 case AMDGPU::S_LSHR_B32: 3765 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3766 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 3767 swapOperands(Inst); 3768 } 3769 break; 3770 case AMDGPU::S_LSHL_B64: 3771 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3772 NewOpcode = 
AMDGPU::V_LSHLREV_B64; 3773 swapOperands(Inst); 3774 } 3775 break; 3776 case AMDGPU::S_ASHR_I64: 3777 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3778 NewOpcode = AMDGPU::V_ASHRREV_I64; 3779 swapOperands(Inst); 3780 } 3781 break; 3782 case AMDGPU::S_LSHR_B64: 3783 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 3784 NewOpcode = AMDGPU::V_LSHRREV_B64; 3785 swapOperands(Inst); 3786 } 3787 break; 3788 3789 case AMDGPU::S_ABS_I32: 3790 lowerScalarAbs(Worklist, Inst); 3791 Inst.eraseFromParent(); 3792 continue; 3793 3794 case AMDGPU::S_CBRANCH_SCC0: 3795 case AMDGPU::S_CBRANCH_SCC1: 3796 // Clear unused bits of vcc 3797 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 3798 AMDGPU::VCC) 3799 .addReg(AMDGPU::EXEC) 3800 .addReg(AMDGPU::VCC); 3801 break; 3802 3803 case AMDGPU::S_BFE_U64: 3804 case AMDGPU::S_BFM_B64: 3805 llvm_unreachable("Moving this op to VALU not implemented"); 3806 3807 case AMDGPU::S_PACK_LL_B32_B16: 3808 case AMDGPU::S_PACK_LH_B32_B16: 3809 case AMDGPU::S_PACK_HH_B32_B16: 3810 movePackToVALU(Worklist, MRI, Inst); 3811 Inst.eraseFromParent(); 3812 continue; 3813 3814 case AMDGPU::S_XNOR_B32: 3815 lowerScalarXnor(Worklist, Inst); 3816 Inst.eraseFromParent(); 3817 continue; 3818 3819 case AMDGPU::S_XNOR_B64: 3820 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32); 3821 Inst.eraseFromParent(); 3822 continue; 3823 3824 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: { 3825 unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3826 const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff); 3827 auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); 3828 unsigned Offset = 0; 3829 3830 // FIXME: This isn't safe because the addressing mode doesn't work 3831 // correctly if vaddr is negative. 3832 // 3833 // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. 
3834 // 3835 // See if we can extract an immediate offset by recognizing one of these: 3836 // V_ADD_I32_e32 dst, imm, src1 3837 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 3838 // V_ADD will be removed by "Remove dead machine instructions". 3839 if (Add && 3840 (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || 3841 Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { 3842 static const unsigned SrcNames[2] = { 3843 AMDGPU::OpName::src0, 3844 AMDGPU::OpName::src1, 3845 }; 3846 3847 // Find a literal offset in one of source operands. 3848 for (int i = 0; i < 2; i++) { 3849 const MachineOperand *Src = 3850 getNamedOperand(*Add, SrcNames[i]); 3851 3852 if (Src->isReg()) { 3853 auto Mov = MRI.getUniqueVRegDef(Src->getReg()); 3854 if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) 3855 Src = &Mov->getOperand(1); 3856 } 3857 3858 if (Src) { 3859 if (Src->isImm()) 3860 Offset = Src->getImm(); 3861 else if (Src->isCImm()) 3862 Offset = Src->getCImm()->getZExtValue(); 3863 } 3864 3865 if (Offset && isLegalMUBUFImmOffset(Offset)) { 3866 VAddr = getNamedOperand(*Add, SrcNames[!i]); 3867 break; 3868 } 3869 3870 Offset = 0; 3871 } 3872 } 3873 3874 MachineInstr *NewInstr = 3875 BuildMI(*MBB, Inst, Inst.getDebugLoc(), 3876 get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) 3877 .add(*VAddr) // vaddr 3878 .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc 3879 .addImm(0) // soffset 3880 .addImm(Offset) // offset 3881 .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) 3882 .addImm(0) // slc 3883 .addImm(0) // tfe 3884 .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()) 3885 .getInstr(); 3886 3887 MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), 3888 VDst); 3889 addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); 3890 Inst.eraseFromParent(); 3891 3892 // Legalize all operands other than the offset. Notably, convert the srsrc 3893 // into SGPRs using v_readfirstlane if needed. 
3894 legalizeOperands(*NewInstr); 3895 continue; 3896 } 3897 } 3898 3899 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 3900 // We cannot move this instruction to the VALU, so we should try to 3901 // legalize its operands instead. 3902 legalizeOperands(Inst); 3903 continue; 3904 } 3905 3906 // Use the new VALU Opcode. 3907 const MCInstrDesc &NewDesc = get(NewOpcode); 3908 Inst.setDesc(NewDesc); 3909 3910 // Remove any references to SCC. Vector instructions can't read from it, and 3911 // We're just about to add the implicit use / defs of VCC, and we don't want 3912 // both. 3913 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 3914 MachineOperand &Op = Inst.getOperand(i); 3915 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 3916 Inst.RemoveOperand(i); 3917 addSCCDefUsersToVALUWorklist(Inst, Worklist); 3918 } 3919 } 3920 3921 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 3922 // We are converting these to a BFE, so we need to add the missing 3923 // operands for the size and offset. 3924 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 3925 Inst.addOperand(MachineOperand::CreateImm(0)); 3926 Inst.addOperand(MachineOperand::CreateImm(Size)); 3927 3928 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 3929 // The VALU version adds the second operand to the result, so insert an 3930 // extra 0 operand. 3931 Inst.addOperand(MachineOperand::CreateImm(0)); 3932 } 3933 3934 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 3935 3936 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 3937 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 3938 // If we need to move this to VGPRs, we need to unpack the second operand 3939 // back into the 2 separate ones for bit offset and width. 3940 assert(OffsetWidthOp.isImm() && 3941 "Scalar BFE is only implemented for constant width and offset"); 3942 uint32_t Imm = OffsetWidthOp.getImm(); 3943 3944 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 
3945 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 3946 Inst.RemoveOperand(2); // Remove old immediate. 3947 Inst.addOperand(MachineOperand::CreateImm(Offset)); 3948 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 3949 } 3950 3951 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 3952 unsigned NewDstReg = AMDGPU::NoRegister; 3953 if (HasDst) { 3954 unsigned DstReg = Inst.getOperand(0).getReg(); 3955 if (TargetRegisterInfo::isPhysicalRegister(DstReg)) 3956 continue; 3957 3958 // Update the destination register class. 3959 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 3960 if (!NewDstRC) 3961 continue; 3962 3963 if (Inst.isCopy() && 3964 TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && 3965 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 3966 // Instead of creating a copy where src and dst are the same register 3967 // class, we just replace all uses of dst with src. These kinds of 3968 // copies interfere with the heuristics MachineSink uses to decide 3969 // whether or not to split a critical edge. Since the pass assumes 3970 // that copies will end up as machine instructions and not be 3971 // eliminated. 3972 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 3973 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 3974 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 3975 Inst.getOperand(0).setReg(DstReg); 3976 3977 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally 3978 // these are deleted later, but at -O0 it would leave a suspicious 3979 // looking illegal copy of an undef register. 
3980 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 3981 Inst.RemoveOperand(I); 3982 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 3983 continue; 3984 } 3985 3986 NewDstReg = MRI.createVirtualRegister(NewDstRC); 3987 MRI.replaceRegWith(DstReg, NewDstReg); 3988 } 3989 3990 // Legalize the operands 3991 legalizeOperands(Inst); 3992 3993 if (HasDst) 3994 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 3995 } 3996 } 3997 3998 // Add/sub require special handling to deal with carry outs. 3999 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, 4000 MachineInstr &Inst) const { 4001 if (ST.hasAddNoCarry()) { 4002 // Assume there is no user of scc since we don't select this in that case. 4003 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 4004 // is used. 4005 4006 MachineBasicBlock &MBB = *Inst.getParent(); 4007 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4008 4009 unsigned OldDstReg = Inst.getOperand(0).getReg(); 4010 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4011 4012 unsigned Opc = Inst.getOpcode(); 4013 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 4014 4015 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 
4016 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 4017 4018 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 4019 Inst.RemoveOperand(3); 4020 4021 Inst.setDesc(get(NewOpc)); 4022 Inst.addImplicitDefUseOperands(*MBB.getParent()); 4023 MRI.replaceRegWith(OldDstReg, ResultReg); 4024 legalizeOperands(Inst); 4025 4026 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 4027 return true; 4028 } 4029 4030 return false; 4031 } 4032 4033 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, 4034 MachineInstr &Inst) const { 4035 MachineBasicBlock &MBB = *Inst.getParent(); 4036 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4037 MachineBasicBlock::iterator MII = Inst; 4038 DebugLoc DL = Inst.getDebugLoc(); 4039 4040 MachineOperand &Dest = Inst.getOperand(0); 4041 MachineOperand &Src = Inst.getOperand(1); 4042 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4043 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4044 4045 unsigned SubOp = ST.hasAddNoCarry() ? 
4046 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; 4047 4048 BuildMI(MBB, MII, DL, get(SubOp), TmpReg) 4049 .addImm(0) 4050 .addReg(Src.getReg()); 4051 4052 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 4053 .addReg(Src.getReg()) 4054 .addReg(TmpReg); 4055 4056 MRI.replaceRegWith(Dest.getReg(), ResultReg); 4057 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 4058 } 4059 4060 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, 4061 MachineInstr &Inst) const { 4062 MachineBasicBlock &MBB = *Inst.getParent(); 4063 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4064 MachineBasicBlock::iterator MII = Inst; 4065 const DebugLoc &DL = Inst.getDebugLoc(); 4066 4067 MachineOperand &Dest = Inst.getOperand(0); 4068 MachineOperand &Src0 = Inst.getOperand(1); 4069 MachineOperand &Src1 = Inst.getOperand(2); 4070 4071 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); 4072 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); 4073 4074 unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4075 if (ST.hasDLInsts()) { 4076 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) 4077 .add(Src0) 4078 .add(Src1); 4079 } else { 4080 unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4081 BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) 4082 .add(Src0) 4083 .add(Src1); 4084 4085 BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest) 4086 .addReg(Xor); 4087 } 4088 4089 MRI.replaceRegWith(Dest.getReg(), NewDest); 4090 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 4091 } 4092 4093 void SIInstrInfo::splitScalar64BitUnaryOp( 4094 SetVectorType &Worklist, MachineInstr &Inst, 4095 unsigned Opcode) const { 4096 MachineBasicBlock &MBB = *Inst.getParent(); 4097 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4098 4099 MachineOperand &Dest = Inst.getOperand(0); 4100 MachineOperand &Src0 = Inst.getOperand(1); 4101 DebugLoc DL = Inst.getDebugLoc(); 
/// Split a 64-bit scalar unary operation into two 32-bit VALU instructions of
/// \p Opcode, one per 32-bit half (sub0/sub1), then recombine the halves with
/// a REG_SEQUENCE and replace all uses of the old destination.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; use SGPR_32 as the stand-in.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  // High half.
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  // Reassemble the 64-bit result from the two halves.
  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
/// Expand S_ADD_U64_PSEUDO/S_SUB_U64_PSEUDO into a 32-bit add/sub pair on the
/// VALU, chaining the carry from the low half into the high half, and
/// recombining the halves with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitAddSub(
    SetVectorType &Worklist, MachineInstr &Inst) const {
  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // CarryReg links the two halves; DeadCarryReg takes the unused carry-out of
  // the high half.
  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);


  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
  MachineInstr *LoHalf =
    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    .addReg(CarryReg, RegState::Define)
    .add(SrcReg0Sub0)
    .add(SrcReg1Sub0);

  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  MachineInstr *HiHalf =
    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .add(SrcReg0Sub1)
    .add(SrcReg1Sub1)
    .addReg(CarryReg, RegState::Kill);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf);
  legalizeOperands(*HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
/// Split a 64-bit scalar binary operation into two 32-bit instructions of
/// \p Opcode, one per sub0/sub1 half of each source, then recombine the halves
/// with a REG_SEQUENCE and replace all uses of the old destination.
void SIInstrInfo::splitScalar64BitBinaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // Immediate sources have no register class; use SGPR_32 as the stand-in.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  // High half.
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  // Reassemble the 64-bit result from the two halves.
  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(LoHalf);
  legalizeOperands(HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Expand S_BCNT1_I32_B64 into two V_BCNT_U32_B32 instructions: count the low
/// half, then count the high half while accumulating the first result via the
/// add-operand of V_BCNT.
void SIInstrInfo::splitScalar64BitBCNT(
    SetVectorType &Worklist, MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = popcount(lo) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  // ResultReg = popcount(hi) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
/// Expand S_BFE_I64 (sign-extend-in-register form only: offset 0, width <= 32)
/// into VALU code.  For width < 32, V_BFE_I32 extracts the field from the low
/// half and an arithmetic shift replicates its sign into the high half; for
/// width == 32, only the sign-fill of the high half is needed.
void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  // Packed operand: bits [5:0] = offset, bits [22:16] = width.
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // Low half: sign-extended bitfield of the source's low 32 bits.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    // High half: all sign bits.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; only sign-fill the high half.
  MachineOperand &Src = Inst.getOperand(1);
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 4368 .addReg(Src.getReg(), 0, AMDGPU::sub0) 4369 .addImm(AMDGPU::sub0) 4370 .addReg(TmpReg) 4371 .addImm(AMDGPU::sub1); 4372 4373 MRI.replaceRegWith(Dest.getReg(), ResultReg); 4374 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 4375 } 4376 4377 void SIInstrInfo::addUsersToMoveToVALUWorklist( 4378 unsigned DstReg, 4379 MachineRegisterInfo &MRI, 4380 SetVectorType &Worklist) const { 4381 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 4382 E = MRI.use_end(); I != E;) { 4383 MachineInstr &UseMI = *I->getParent(); 4384 if (!canReadVGPR(UseMI, I.getOperandNo())) { 4385 Worklist.insert(&UseMI); 4386 4387 do { 4388 ++I; 4389 } while (I != E && I->getParent() == &UseMI); 4390 } else { 4391 ++I; 4392 } 4393 } 4394 } 4395 4396 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, 4397 MachineRegisterInfo &MRI, 4398 MachineInstr &Inst) const { 4399 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4400 MachineBasicBlock *MBB = Inst.getParent(); 4401 MachineOperand &Src0 = Inst.getOperand(1); 4402 MachineOperand &Src1 = Inst.getOperand(2); 4403 const DebugLoc &DL = Inst.getDebugLoc(); 4404 4405 switch (Inst.getOpcode()) { 4406 case AMDGPU::S_PACK_LL_B32_B16: { 4407 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4408 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4409 4410 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 4411 // 0. 
/// Expand the scalar S_PACK_{LL,LH,HH}_B32_B16 instructions into equivalent
/// VALU sequences that assemble a 32-bit value from two 16-bit halves.
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    // result = (Src0 & 0xffff) | (Src1 << 16)
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // result = bitfield-insert: low 16 from Src0, high 16 from Src1.
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // result = (Src0 >> 16) | (Src1 & 0xffff0000)
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
4466 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) 4467 return; 4468 4469 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) 4470 Worklist.insert(&MI); 4471 } 4472 } 4473 4474 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 4475 const MachineInstr &Inst) const { 4476 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 4477 4478 switch (Inst.getOpcode()) { 4479 // For target instructions, getOpRegClass just returns the virtual register 4480 // class associated with the operand, so we need to find an equivalent VGPR 4481 // register class in order to move the instruction to the VALU. 4482 case AMDGPU::COPY: 4483 case AMDGPU::PHI: 4484 case AMDGPU::REG_SEQUENCE: 4485 case AMDGPU::INSERT_SUBREG: 4486 case AMDGPU::WQM: 4487 case AMDGPU::WWM: 4488 if (RI.hasVGPRs(NewDstRC)) 4489 return nullptr; 4490 4491 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 4492 if (!NewDstRC) 4493 return nullptr; 4494 return NewDstRC; 4495 default: 4496 return NewDstRC; 4497 } 4498 } 4499 4500 // Find the one SGPR operand we are allowed to use. 4501 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 4502 int OpIndices[3]) const { 4503 const MCInstrDesc &Desc = MI.getDesc(); 4504 4505 // Find the one SGPR operand we are allowed to use. 4506 // 4507 // First we need to consider the instruction's operand requirements before 4508 // legalizing. Some operands are required to be SGPRs, such as implicit uses 4509 // of VCC, but we are still bound by the constant bus requirement to only use 4510 // one. 4511 // 4512 // If the operand's class is an SGPR, we can never move it. 
// Find the one SGPR operand we are allowed to use.
/// Given up to three source operand indices in \p OpIndices (terminated by
/// -1), pick the single SGPR the instruction is allowed to keep under the
/// constant bus restriction, or AMDGPU::NoRegister when no choice is forced
/// or profitable.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, Check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// Return a pointer to the named operand of \p MI, or nullptr if \p MI's
/// opcode has no operand with that name.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// Return the default buffer resource DATA_FORMAT dword, with OS/generation
/// specific cache-control bits applied.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// Return words 2-3 of the scratch buffer resource descriptor: size, element
/// size, index stride, and TID_ENABLE, with generation-specific adjustments.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64.
  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}
4611 if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 4612 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 4613 4614 return Rsrc23; 4615 } 4616 4617 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 4618 unsigned Opc = MI.getOpcode(); 4619 4620 return isSMRD(Opc); 4621 } 4622 4623 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { 4624 unsigned Opc = MI.getOpcode(); 4625 4626 return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); 4627 } 4628 4629 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 4630 int &FrameIndex) const { 4631 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 4632 if (!Addr || !Addr->isFI()) 4633 return AMDGPU::NoRegister; 4634 4635 assert(!MI.memoperands_empty() && 4636 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS); 4637 4638 FrameIndex = Addr->getIndex(); 4639 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 4640 } 4641 4642 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 4643 int &FrameIndex) const { 4644 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 4645 assert(Addr && Addr->isFI()); 4646 FrameIndex = Addr->getIndex(); 4647 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 4648 } 4649 4650 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 4651 int &FrameIndex) const { 4652 if (!MI.mayLoad()) 4653 return AMDGPU::NoRegister; 4654 4655 if (isMUBUF(MI) || isVGPRSpill(MI)) 4656 return isStackAccess(MI, FrameIndex); 4657 4658 if (isSGPRSpill(MI)) 4659 return isSGPRStackAccess(MI, FrameIndex); 4660 4661 return AMDGPU::NoRegister; 4662 } 4663 4664 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 4665 int &FrameIndex) const { 4666 if (!MI.mayStore()) 4667 return AMDGPU::NoRegister; 4668 4669 if (isMUBUF(MI) || isVGPRSpill(MI)) 4670 return isStackAccess(MI, FrameIndex); 4671 4672 if (isSGPRSpill(MI)) 4673 return isSGPRStackAccess(MI, FrameIndex); 4674 4675 
return AMDGPU::NoRegister; 4676 } 4677 4678 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 4679 unsigned Size = 0; 4680 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 4681 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 4682 while (++I != E && I->isInsideBundle()) { 4683 assert(!I->isBundle() && "No nested bundle!"); 4684 Size += getInstSizeInBytes(*I); 4685 } 4686 4687 return Size; 4688 } 4689 4690 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 4691 unsigned Opc = MI.getOpcode(); 4692 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 4693 unsigned DescSize = Desc.getSize(); 4694 4695 // If we have a definitive size, we can use it. Otherwise we need to inspect 4696 // the operands to know the size. 4697 // 4698 // FIXME: Instructions that have a base 32-bit encoding report their size as 4699 // 4, even though they are really 8 bytes if they have a literal operand. 4700 if (DescSize != 0 && DescSize != 4) 4701 return DescSize; 4702 4703 // 4-byte instructions may have a 32-bit literal encoded after them. Check 4704 // operands that coud ever be literals. 4705 if (isVALU(MI) || isSALU(MI)) { 4706 if (isFixedSize(MI)) 4707 return DescSize; 4708 4709 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 4710 if (Src0Idx == -1) 4711 return 4; // No operands. 
4712 4713 if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) 4714 return 8; 4715 4716 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 4717 if (Src1Idx == -1) 4718 return 4; 4719 4720 if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) 4721 return 8; 4722 4723 return 4; 4724 } 4725 4726 if (DescSize == 4) 4727 return 4; 4728 4729 switch (Opc) { 4730 case TargetOpcode::IMPLICIT_DEF: 4731 case TargetOpcode::KILL: 4732 case TargetOpcode::DBG_VALUE: 4733 case TargetOpcode::EH_LABEL: 4734 return 0; 4735 case TargetOpcode::BUNDLE: 4736 return getInstBundleSize(MI); 4737 case TargetOpcode::INLINEASM: { 4738 const MachineFunction *MF = MI.getParent()->getParent(); 4739 const char *AsmStr = MI.getOperand(0).getSymbolName(); 4740 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); 4741 } 4742 default: 4743 llvm_unreachable("unable to find instruction size"); 4744 } 4745 } 4746 4747 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 4748 if (!isFLAT(MI)) 4749 return false; 4750 4751 if (MI.memoperands_empty()) 4752 return true; 4753 4754 for (const MachineMemOperand *MMO : MI.memoperands()) { 4755 if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) 4756 return true; 4757 } 4758 return false; 4759 } 4760 4761 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 4762 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 4763 } 4764 4765 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 4766 MachineBasicBlock *IfEnd) const { 4767 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 4768 assert(TI != IfEntry->end()); 4769 4770 MachineInstr *Branch = &(*TI); 4771 MachineFunction *MF = IfEntry->getParent(); 4772 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 4773 4774 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 4775 unsigned DstReg = 
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4776 MachineInstr *SIIF = 4777 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 4778 .add(Branch->getOperand(0)) 4779 .add(Branch->getOperand(1)); 4780 MachineInstr *SIEND = 4781 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 4782 .addReg(DstReg); 4783 4784 IfEntry->erase(TI); 4785 IfEntry->insert(IfEntry->end(), SIIF); 4786 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 4787 } 4788 } 4789 4790 void SIInstrInfo::convertNonUniformLoopRegion( 4791 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 4792 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 4793 // We expect 2 terminators, one conditional and one unconditional. 4794 assert(TI != LoopEnd->end()); 4795 4796 MachineInstr *Branch = &(*TI); 4797 MachineFunction *MF = LoopEnd->getParent(); 4798 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 4799 4800 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 4801 4802 unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4803 unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4804 MachineInstrBuilder HeaderPHIBuilder = 4805 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 4806 for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), 4807 E = LoopEntry->pred_end(); 4808 PI != E; ++PI) { 4809 if (*PI == LoopEnd) { 4810 HeaderPHIBuilder.addReg(BackEdgeReg); 4811 } else { 4812 MachineBasicBlock *PMBB = *PI; 4813 unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4814 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 4815 ZeroReg, 0); 4816 HeaderPHIBuilder.addReg(ZeroReg); 4817 } 4818 HeaderPHIBuilder.addMBB(*PI); 4819 } 4820 MachineInstr *HeaderPhi = HeaderPHIBuilder; 4821 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 4822 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 4823 .addReg(DstReg) 4824 
.add(Branch->getOperand(0)); 4825 MachineInstr *SILOOP = 4826 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 4827 .addReg(BackEdgeReg) 4828 .addMBB(LoopEntry); 4829 4830 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 4831 LoopEnd->erase(TI); 4832 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 4833 LoopEnd->insert(LoopEnd->end(), SILOOP); 4834 } 4835 } 4836 4837 ArrayRef<std::pair<int, const char *>> 4838 SIInstrInfo::getSerializableTargetIndices() const { 4839 static const std::pair<int, const char *> TargetIndices[] = { 4840 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 4841 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 4842 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 4843 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 4844 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 4845 return makeArrayRef(TargetIndices); 4846 } 4847 4848 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 4849 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 4850 ScheduleHazardRecognizer * 4851 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 4852 const ScheduleDAG *DAG) const { 4853 return new GCNHazardRecognizer(DAG->MF); 4854 } 4855 4856 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 4857 /// pass. 
4858 ScheduleHazardRecognizer * 4859 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 4860 return new GCNHazardRecognizer(MF); 4861 } 4862 4863 std::pair<unsigned, unsigned> 4864 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 4865 return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); 4866 } 4867 4868 ArrayRef<std::pair<unsigned, const char *>> 4869 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 4870 static const std::pair<unsigned, const char *> TargetFlags[] = { 4871 { MO_GOTPCREL, "amdgpu-gotprel" }, 4872 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 4873 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 4874 { MO_REL32_LO, "amdgpu-rel32-lo" }, 4875 { MO_REL32_HI, "amdgpu-rel32-hi" } 4876 }; 4877 4878 return makeArrayRef(TargetFlags); 4879 } 4880 4881 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 4882 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 4883 MI.modifiesRegister(AMDGPU::EXEC, &RI); 4884 } 4885 4886 MachineInstrBuilder 4887 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 4888 MachineBasicBlock::iterator I, 4889 const DebugLoc &DL, 4890 unsigned DestReg) const { 4891 if (ST.hasAddNoCarry()) 4892 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 4893 4894 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4895 unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4896 MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC); 4897 4898 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) 4899 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 4900 } 4901 4902 bool SIInstrInfo::isKillTerminator(unsigned Opcode) { 4903 switch (Opcode) { 4904 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 4905 case AMDGPU::SI_KILL_I1_TERMINATOR: 4906 return true; 4907 default: 4908 return false; 4909 } 4910 } 4911 4912 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { 4913 
switch (Opcode) { 4914 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 4915 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); 4916 case AMDGPU::SI_KILL_I1_PSEUDO: 4917 return get(AMDGPU::SI_KILL_I1_TERMINATOR); 4918 default: 4919 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); 4920 } 4921 } 4922 4923 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { 4924 if (!isSMRD(MI)) 4925 return false; 4926 4927 // Check that it is using a buffer resource. 4928 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); 4929 if (Idx == -1) // e.g. s_memtime 4930 return false; 4931 4932 const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; 4933 return RCID == AMDGPU::SReg_128RegClassID; 4934 } 4935