1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI Implementation of TargetInstrInfo. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIInstrInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUSubtarget.h" 17 #include "GCNHazardRecognizer.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "SIRegisterInfo.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "Utils/AMDGPUBaseInfo.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ArrayRef.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/StringRef.h" 27 #include "llvm/ADT/iterator_range.h" 28 #include "llvm/Analysis/AliasAnalysis.h" 29 #include "llvm/Analysis/MemoryLocation.h" 30 #include "llvm/Analysis/ValueTracking.h" 31 #include "llvm/CodeGen/MachineBasicBlock.h" 32 #include "llvm/CodeGen/MachineDominators.h" 33 #include "llvm/CodeGen/MachineFrameInfo.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineInstr.h" 36 #include "llvm/CodeGen/MachineInstrBuilder.h" 37 #include "llvm/CodeGen/MachineInstrBundle.h" 38 #include "llvm/CodeGen/MachineMemOperand.h" 39 #include "llvm/CodeGen/MachineOperand.h" 40 #include "llvm/CodeGen/MachineRegisterInfo.h" 41 #include "llvm/CodeGen/RegisterScavenging.h" 42 #include "llvm/CodeGen/ScheduleDAG.h" 43 #include "llvm/CodeGen/SelectionDAGNodes.h" 44 #include "llvm/CodeGen/TargetOpcodes.h" 45 #include "llvm/CodeGen/TargetRegisterInfo.h" 46 #include "llvm/IR/DebugLoc.h" 47 #include "llvm/IR/DiagnosticInfo.h" 48 #include "llvm/IR/Function.h" 49 #include "llvm/IR/InlineAsm.h" 50 #include 
"llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}


// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

// The two pseudo opcodes passed to the generated base class identify the
// call-frame setup / destroy instructions for this target.
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// Returns the number of operands of \p Node, not counting any trailing
/// glue operands (MVT::Glue values at the end of the operand list).
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither instruction has the named operand: vacuously equal.
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;


  // Exactly one of the two instructions has the operand.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

// Whitelist of plain moves that are safe to rematerialize. The operand-count
// comparison rejects any instruction that has picked up extra implicit
// operands beyond its static MCInstrDesc (e.g. an added register use).
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                   AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // No implicit operands.
    return MI.getNumOperands() == MI.getDesc().getNumOperands();
  default:
    return false;
  }
}

// Determine whether two selection-DAG machine loads address memory through
// the same base, and if so return their immediate offsets. Handled classes:
// DS (LDS), SMRD (scalar), and MUBUF/MTBUF (buffer) loads.
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of datalesss loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Operand 1 is the immediate offset; non-constant offsets are rejected.
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

// Returns true for the DS read2/write2 "ST64" opcodes, whose two accesses
// are 64 elements apart instead of adjacent.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

// TargetInstrInfo hook: decompose a load/store into its base operand(s) and
// an immediate byte offset, for use by the machine scheduler's clustering
// logic. Returns false for address forms that cannot be summarized this way.
bool SIInstrInfo::getMemOperandsWithOffset(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
    const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      // NOTE(review): Offset0Op/Offset1Op are assumed present on every DS
      // instruction that lacks a plain offset operand — confirm against the
      // DS instruction definitions.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm();
      unsigned Offset1 = Offset1Op->getImm();
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        // The destination register covers both elements of a read2, so half
        // of its width (bits / 16) is the per-element byte size.
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset && SOffset->isReg()) {
      // We can only handle this if it's a stack access, as any other resource
      // would require reporting multiple base registers.
      const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (AddrReg && !AddrReg->isFI())
        return false;

      const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      const SIMachineFunctionInfo *MFI
        = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
      if (RSrc->getReg() != MFI->getScratchRSrcReg())
        return false;

      const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
      BaseOps.push_back(RSrc);
      BaseOps.push_back(SOffset);
      Offset = OffsetImm->getImm();
      return true;
    }

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(BaseOp);

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);

    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    if (SOffset) // soffset can be an inline immediate.
      Offset += SOffset->getImm();
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    return true;
  }

  return false;
}

// True if the two base-operand lists are element-wise identical.
static bool
memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
                           ArrayRef<const MachineOperand *> BaseOps2) {
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I)
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  return true;
}

// True if the two memory instructions share a base pointer: either their
// base operands match exactly, or (as a fallback) their single memory
// operands resolve to the same underlying IR object in the same address
// space.
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  if (memOpsHaveSameBaseOperands(BaseOps1, BaseOps2))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  const MachineFunction &MF = *MI1.getParent()->getParent();
  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
  Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);

  // An undef base gives no aliasing information.
  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

// Scheduler hook: decide whether two memory ops (already known to be
// adjacent candidates) should be scheduled next to each other. Limits the
// cluster by instruction count and by an estimate of bytes loaded.
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads) const {
  assert(!BaseOps1.empty() && !BaseOps2.empty());
  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();

  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
    return false;

  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
    const unsigned MaxGlobalLoadCluster = 7;
    if (NumLoads > MaxGlobalLoadCluster)
      return false;

    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    if (!FirstDst)
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
    if (!SecondDst)
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();

  const Register Reg = FirstDst->getReg();

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);

  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
  // of clusterNeighboringMemOps which was previosly passing cluster length
  // less 1. LoadClusterThreshold should be tuned instead.
  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
         LoadClusterThreshold;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 16 store batches.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have less than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

// Emit a diagnostic for an unsupported physreg copy (e.g. VGPR -> SGPR) and
// lower it to the SI_ILLEGAL_COPY pseudo so codegen can continue after the
// error is reported.
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
                                        "illegal SGPR to VGPR copy",
                                        DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}

// Lower a physical register copy into real machine instructions, dispatching
// on the destination register class: 32/64-bit SGPR and VGPR moves, SCC
// materialization, AGPR copies (which may need a scavenged VGPR temporary),
// 16-bit VGPR half copies, and finally a generic per-subregister move loop
// for wide register tuples.
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      // Materialize SCC into an SGPR as 1/0.
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copy into SCC by comparing the source against zero.
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
      .addReg(SrcReg, getKillRegState(KillSrc))
      .addImm(0);
    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
      // First try to find defining accvgpr_write to avoid temporary registers.
      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
        --Def;
        if (!Def->definesRegister(SrcReg, &RI))
          continue;
        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
          break;

        MachineOperand &DefOp = Def->getOperand(1);
        assert(DefOp.isReg() || DefOp.isImm());

        if (DefOp.isReg()) {
          // Check that register source operand if not clobbered before MI.
          // Immediate operands are always safe to propagate.
          bool SafeToPropagate = true;
          for (auto I = Def; I != MI && SafeToPropagate; ++I)
            if (I->modifiesRegister(DefOp.getReg(), &RI))
              SafeToPropagate = false;

          if (!SafeToPropagate)
            break;

          DefOp.setIsKill(false);
        }

        // Re-issue the accvgpr_write's source directly into DestReg.
        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
          .add(DefOp);
        return;
      }

      // No propagatable def found: route the copy through a scavenged VGPR.
      RegScavenger RS;
      RS.enterBasicBlock(MBB);
      RS.forward(MI);

      // Ideally we want to have three registers for a long reg_sequence copy
      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                                 *MBB.getParent());

      // Registers in the sequence are allocated contiguously so we can just
      // use register number to pick one of three round-robin temps.
      unsigned RegNo = DestReg % 3;
      Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
      if (!Tmp)
        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
      RS.setRegUsed(Tmp);
      // Only loop through if there are any free registers left, otherwise
      // scavenger may report a fatal error without emergency spill slot
      // or spill with the slot.
      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
          break;
        Tmp = Tmp2;
        RS.setRegUsed(Tmp);
      }
      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
        .addReg(Tmp, RegState::Kill);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::VGPR_LO16RegClass || RC == &AMDGPU::VGPR_HI16RegClass) {
    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
           AMDGPU::VGPR_HI16RegClass.contains(SrcReg));

    // d s
    // l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2
    //          llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2
    // l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d
    //          llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2
    // h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d
    //          0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2
    // h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2
    //          llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2

    bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass;
    bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg);
    // Widen both halves to their containing 32-bit VGPRs.
    DestReg = RI.getMatchingSuperReg(DestReg,
                                     DstLow ? AMDGPU::lo16 : AMDGPU::hi16,
                                     &AMDGPU::VGPR_32RegClass);
    SrcReg = RI.getMatchingSuperReg(SrcReg,
                                    SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
                                    &AMDGPU::VGPR_32RegClass);

    if (DestReg == SrcReg) {
      // l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0]
      // h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0]
      if (DstLow == SrcLow)
        return;
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg)
        .addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0)
        .addReg(DestReg, RegState::Undef)
        .addImm(0) // src1_mod
        .addImm(0) // src1
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0);

      return;
    }

    // Last instruction first:
    auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg)
      .addReg((SrcLow && !DstLow) ? SrcReg : DestReg,
              (SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0)
      .addReg((!SrcLow && DstLow) ? SrcReg : DestReg,
              (!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0)
      .addImm(2);

    unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32
                                           : SrcLow ? AMDGPU::V_LSHRREV_B32_e32
                                                    : AMDGPU::V_LSHLREV_B32_e32;
    auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg);
    if (DstLow == SrcLow) { // alignbyte
      First
        .addReg(SrcLow ? SrcReg : DestReg,
                SrcLow ? getKillRegState(KillSrc) : unsigned(RegState::Undef))
        .addReg(SrcLow ? DestReg : SrcReg,
                SrcLow ? unsigned(RegState::Undef) : getKillRegState(KillSrc))
        .addImm(2);
    } else {
      First.addImm(16)
        .addReg(DestReg, RegState::Undef);
    }

    return;
  }

  // Generic case: split the copy into per-subregister moves of EltSize bytes.
  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RC)) {
    // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
    if (!(RI.getRegSizeInBits(*RC) % 64)) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }

    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
  } else if (RI.hasAGPRs(RC)) {
    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
      AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32;
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
  // Copy in ascending order when the destination is below the source so the
  // subregister moves never clobber not-yet-copied source parts on overlap.
  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    if (Opcode == TargetOpcode::COPY) {
      // AGPR <- AGPR: recurse so the 32-bit AGPR path (scavenging etc.)
      // handles each element.
      copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
      continue;
    }

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

    // Only the last subregister move may kill the full source register.
    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
  }
}

// Map an opcode to its commuted form (or back), returning -1 if the
// commuted opcode does not exist on this target, and the opcode itself if
// it has no REV counterpart at all.
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

// Emit instructions materializing the immediate \p Value into \p DestReg,
// choosing mov opcodes by register class and splitting wide registers into
// subregister moves (only the lowest part receives Value; upper parts get 0).
void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  if (RegClass == &AMDGPU::VReg_64RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    // NOTE(review): only the first subregister gets Value; higher parts are
    // zeroed, so Value is assumed to fit in the first element — confirm
    // against callers.
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
    Builder.addImm(IdxValue);
  }
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  // Selects are lowered with v_cndmask, so always prefer a 32-bit VGPR
  // regardless of the requested size.
  return &AMDGPU::VGPR_32RegClass;
}

// Lower a select into a v_cndmask_b32 whose mask operand is derived from
// \p Cond: either a single boolean register, or a (predicate, register)
// pair using SCC/VCC/EXEC semantics.
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      // Inverted condition: swap true/false operands of the cndmask.
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(TrueReg)
          .addImm(0)
          .addReg(FalseReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      // s_or_saveexec with 0 copies EXEC and sets SCC to EXEC != 0.
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      // NOTE(review): EXECZ is rejected at runtime despite the sequence
      // built above — the llvm_unreachable fires before the function
      // returns normally; presumably no caller produces EXECZ yet.
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

// Emit v_cmp_eq_i32 comparing \p SrcReg against \p Value, returning the new
// virtual boolean (condition mask) register holding the result.
Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

// Emit v_cmp_ne_i32 comparing \p SrcReg against \p Value, returning the new
// virtual boolean (condition mask) register holding the result.
Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (RI.hasAGPRs(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 32) {
    return RI.isSGPRClass(DstRC) ?
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

/// Map a vector register size in bits to the VGPR indirect-write pseudo that
/// writes one 32-bit element of a vector of that size.
static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
  switch (VecSize) {
  case 32: // 4 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
  case 64: // 8 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
  case 96: // 12 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
  case 128: // 16 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
  case 160: // 20 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
  case 256: // 32 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
  case 512: // 64 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
  case 1024: // 128 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;
  default:
    llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
  }
}

/// Map a vector register size in bits to the SGPR indirect-write pseudo with
/// 32-bit elements.
static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
  switch (VecSize) {
  case 32: // 4 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
  case 64: // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
  case 96: // 12 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
  case 128: // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
  case 160: // 20 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
  case 256: // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
  case 512: // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
  case 1024: // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
  default:
    llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
  }
}

/// Map a vector register size in bits to the SGPR indirect-write pseudo with
/// 64-bit elements.
static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
  switch (VecSize) {
  case 64: // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
  case 128: // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
  case 256: // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
  case 512: // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
  case 1024: // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;
  default:
    llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
  }
}

/// Return the indirect-write pseudo for a \p VecSize-bit vector with
/// \p EltSize-bit elements. SGPR writes support 32- and 64-bit elements;
/// VGPR writes only 32-bit.
const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
  unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
  if (IsSGPR) {
    switch (EltSize) {
    case 32:
      return get(getIndirectSGPRWritePseudo32(VecSize));
    case 64:
      return get(getIndirectSGPRWritePseudo64(VecSize));
    default:
      llvm_unreachable("invalid reg indexing elt size");
    }
  }

  assert(EltSize == 32 && "invalid reg indexing elt size");
  return get(getIndirectVGPRWritePseudoOpc(VecSize));
}

/// Map an SGPR spill size in bytes to the matching save pseudo.
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_S96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_S160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a VGPR spill size in bytes to the matching save pseudo.
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_V160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map an AGPR spill size in bytes to the matching save pseudo. Note: only a
/// subset of the VGPR sizes has an AGPR pseudo here (no 12/20/32-byte cases).
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_A64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_A128_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Spill \p SrcReg to stack slot \p FrameIndex using the target spill
/// pseudos (SGPR spills may later be lowered to VGPR lanes rather than
/// memory).
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on number sgprs, so we need
    // to make sure we are using the correct register class.
    if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    // Add the scratch resource registers as implicit uses because we may end up
    // needing them, and need to ensure that the reserved registers are
    // correctly handled.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
  if (RI.hasAGPRs(RC)) {
    // AGPR spill pseudos take an extra VGPR temp def as their first operand.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
}

/// Map an SGPR spill size in bytes to the matching restore pseudo.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a VGPR spill size in bytes to the matching restore pseudo.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map an AGPR spill size in bytes to the matching restore pseudo (same
/// size subset as the AGPR save pseudos).
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Reload \p DestReg from stack slot \p FrameIndex; mirrors
/// storeRegToStackSlot above.
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                    : getVGPRSpillRestoreOpcode(SpillSize);
  auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
  if (RI.hasAGPRs(RC)) {
    // AGPR restore pseudos take an extra VGPR temp def as well.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addFrameIndex(FrameIndex)           // vaddr
     .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
}

/// \param @Offset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize =
ST.getWavefrontSize();

  Register TIDReg = MFI->getTIDReg();
  // Compute (and cache) a per-lane thread-id register scaled to a byte
  // offset the first time any LDS spill address is requested.
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    const DebugLoc &DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    // No free VGPR available; caller must handle NoRegister.
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      // Multiple waves per workgroup: derive a workgroup-unique id from the
      // workgroup ids and the dispatch's NGROUPS values read from the
      // kernarg segment.
      Register TIDIGXReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      Register TIDIGYReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      Register TIDIGZReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      Register InputPtrReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
      getAddNoCarry(Entry, Insert, DL, TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg)
        .addImm(0); // clamp bit
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the lane id by 4 (shift left 2) to form a dword byte offset.
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  getAddNoCarry(MBB, MI, DL, TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg)
    .addImm(0); // clamp bit

  return TmpReg;
}

/// Insert \p Count wait states as S_NOPs; each S_NOP immediate N encodes
/// N + 1 wait states, up to 8 per instruction.
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

/// Append an S_ENDPGM (void return) or SI_RETURN_TO_EPILOG terminator to
/// \p MBB if it has no successors and no terminator yet. Entry functions
/// only (asserted).
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

/// Number of wait states an instruction provides (S_NOP N gives N + 1).
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

/// Expand target-specific post-RA pseudo instructions in place. Returns true
/// when this routine handled the expansion.
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;

  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Split the 64-bit move into two 32-bit V_MOVs of the sub-registers,
    // with implicit full-register defs tying the halves together.
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    expandMovDPP64(MI);
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Invert EXEC, write the "inactive" value into the now-active lanes,
    // then invert EXEC back.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // Same as the B32 case, but the inner move is a V_MOV_B64_PSEUDO that is
    // recursively expanded.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: {
    // Lower indirect vector-element writes to a MOVREL (or V_MOV with VGPR
    // index mode), with implicit defs/uses of the whole vector tied together.
    const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);

    unsigned Opc;
    if (RI.hasVGPRs(EltRC)) {
      Opc = ST.useVGPRIndexMode() ?
        AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
    } else {
      Opc = RI.getRegSizeInBits(*EltRC) == 64 ?
        AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32;
    }

    const MCInstrDesc &OpDesc = get(Opc);
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstrBuilder MIB =
      BuildMI(MBB, MI, DL, OpDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // The implicit def/use of VecReg appended above sit right after the
    // static operand list; tie them so the register allocator keeps them in
    // the same register.
    const int ImpDefIdx =
      OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  }
  return true;
}

/// Split a V_MOV_B64_DPP_PSEUDO into two 32-bit V_MOV_B32_dpp instructions
/// (one per half) and, for virtual destinations, a REG_SEQUENCE recombining
/// them. Returns the two half-moves.
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register Dst = MI.getOperand(0).getReg();
  unsigned Part = 0;
  MachineInstr *Split[2];


  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
    if (Dst.isPhysical()) {
      MovDPP.addDef(RI.getSubReg(Dst, Sub));
    } else {
      assert(MRI.isSSA());
      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      MovDPP.addDef(Tmp);
    }

    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
      const MachineOperand &SrcOp = MI.getOperand(I);
      assert(!SrcOp.isFPImm());
      if (SrcOp.isImm()) {
        // Select the 32-bit half of the immediate for this part.
        APInt Imm(64, SrcOp.getImm());
        Imm.ashrInPlace(Part * 32);
        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
      } else {
        assert(SrcOp.isReg());
        Register Src = SrcOp.getReg();
        if (Src.isPhysical())
          MovDPP.addReg(RI.getSubReg(Src, Sub));
        else
          MovDPP.addReg(Src, SrcOp.isUndef() ?
RegState::Undef : 0, Sub); 1758 } 1759 } 1760 1761 for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) 1762 MovDPP.addImm(MI.getOperand(I).getImm()); 1763 1764 Split[Part] = MovDPP; 1765 ++Part; 1766 } 1767 1768 if (Dst.isVirtual()) 1769 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) 1770 .addReg(Split[0]->getOperand(0).getReg()) 1771 .addImm(AMDGPU::sub0) 1772 .addReg(Split[1]->getOperand(0).getReg()) 1773 .addImm(AMDGPU::sub1); 1774 1775 MI.eraseFromParent(); 1776 return std::make_pair(Split[0], Split[1]); 1777 } 1778 1779 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 1780 MachineOperand &Src0, 1781 unsigned Src0OpName, 1782 MachineOperand &Src1, 1783 unsigned Src1OpName) const { 1784 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 1785 if (!Src0Mods) 1786 return false; 1787 1788 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 1789 assert(Src1Mods && 1790 "All commutable instructions have both src0 and src1 modifiers"); 1791 1792 int Src0ModsVal = Src0Mods->getImm(); 1793 int Src1ModsVal = Src1Mods->getImm(); 1794 1795 Src1Mods->setImm(Src0ModsVal); 1796 Src0Mods->setImm(Src1ModsVal); 1797 return true; 1798 } 1799 1800 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 1801 MachineOperand &RegOp, 1802 MachineOperand &NonRegOp) { 1803 Register Reg = RegOp.getReg(); 1804 unsigned SubReg = RegOp.getSubReg(); 1805 bool IsKill = RegOp.isKill(); 1806 bool IsDead = RegOp.isDead(); 1807 bool IsUndef = RegOp.isUndef(); 1808 bool IsDebug = RegOp.isDebug(); 1809 1810 if (NonRegOp.isImm()) 1811 RegOp.ChangeToImmediate(NonRegOp.getImm()); 1812 else if (NonRegOp.isFI()) 1813 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 1814 else 1815 return nullptr; 1816 1817 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 1818 NonRegOp.setSubReg(SubReg); 1819 1820 return &MI; 1821 } 1822 1823 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 1824 unsigned Src0Idx, 1825 
unsigned Src1Idx) const { 1826 assert(!NewMI && "this should never be used"); 1827 1828 unsigned Opc = MI.getOpcode(); 1829 int CommutedOpcode = commuteOpcode(Opc); 1830 if (CommutedOpcode == -1) 1831 return nullptr; 1832 1833 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 1834 static_cast<int>(Src0Idx) && 1835 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 1836 static_cast<int>(Src1Idx) && 1837 "inconsistency with findCommutedOpIndices"); 1838 1839 MachineOperand &Src0 = MI.getOperand(Src0Idx); 1840 MachineOperand &Src1 = MI.getOperand(Src1Idx); 1841 1842 MachineInstr *CommutedMI = nullptr; 1843 if (Src0.isReg() && Src1.isReg()) { 1844 if (isOperandLegal(MI, Src1Idx, &Src0)) { 1845 // Be sure to copy the source modifiers to the right place. 1846 CommutedMI 1847 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 1848 } 1849 1850 } else if (Src0.isReg() && !Src1.isReg()) { 1851 // src0 should always be able to support any operand type, so no need to 1852 // check operand legality. 1853 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 1854 } else if (!Src0.isReg() && Src1.isReg()) { 1855 if (isOperandLegal(MI, Src1Idx, &Src0)) 1856 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 1857 } else { 1858 // FIXME: Found two non registers to commute. This does happen. 1859 return nullptr; 1860 } 1861 1862 if (CommutedMI) { 1863 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1864 Src1, AMDGPU::OpName::src1_modifiers); 1865 1866 CommutedMI->setDesc(get(CommutedOpcode)); 1867 } 1868 1869 return CommutedMI; 1870 } 1871 1872 // This needs to be implemented because the source modifiers may be inserted 1873 // between the true commutable operands, and the base 1874 // TargetInstrInfo::commuteInstruction uses it. 
/// TargetInstrInfo hook: report which operand indices of \p MI may be
/// swapped by commuteInstruction. Delegates to the MCInstrDesc-based
/// overload below.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}

/// Determine the commutable operand indices for the instruction described by
/// \p Desc. Only instructions marked commutable that carry both named src0
/// and src1 operands qualify; the final index fixup is delegated to
/// fixCommutedOpIndices.
// NOTE(review): Desc is taken by value; a const reference would avoid the
// copy, but the signature is fixed by the class declaration — confirm before
// changing.
bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

/// Return true if a branch of kind \p BranchOp can reach a target at byte
/// distance \p BrOffset. The usable width is BranchOffsetBits, a cl::opt
/// (default 16) that tests may shrink to exercise branch relaxation.
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

/// Return the destination block of branch \p MI, or null for S_SETPC_B64,
/// whose target cannot be determined statically.
MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
1919 return nullptr; 1920 } 1921 1922 return MI.getOperand(0).getMBB(); 1923 } 1924 1925 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1926 MachineBasicBlock &DestBB, 1927 const DebugLoc &DL, 1928 int64_t BrOffset, 1929 RegScavenger *RS) const { 1930 assert(RS && "RegScavenger required for long branching"); 1931 assert(MBB.empty() && 1932 "new block should be inserted for expanding unconditional branch"); 1933 assert(MBB.pred_size() == 1); 1934 1935 MachineFunction *MF = MBB.getParent(); 1936 MachineRegisterInfo &MRI = MF->getRegInfo(); 1937 1938 // FIXME: Virtual register workaround for RegScavenger not working with empty 1939 // blocks. 1940 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1941 1942 auto I = MBB.end(); 1943 1944 // We need to compute the offset relative to the instruction immediately after 1945 // s_getpc_b64. Insert pc arithmetic code before last terminator. 1946 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 1947 1948 // TODO: Handle > 32-bit block address. 1949 if (BrOffset >= 0) { 1950 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 1951 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1952 .addReg(PCReg, 0, AMDGPU::sub0) 1953 .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); 1954 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 1955 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1956 .addReg(PCReg, 0, AMDGPU::sub1) 1957 .addImm(0); 1958 } else { 1959 // Backwards branch. 1960 BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) 1961 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1962 .addReg(PCReg, 0, AMDGPU::sub0) 1963 .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); 1964 BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) 1965 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1966 .addReg(PCReg, 0, AMDGPU::sub1) 1967 .addImm(0); 1968 } 1969 1970 // Insert the indirect branch after the other terminator. 
1971 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 1972 .addReg(PCReg); 1973 1974 // FIXME: If spilling is necessary, this will fail because this scavenger has 1975 // no emergency stack slots. It is non-trivial to spill in this situation, 1976 // because the restore code needs to be specially placed after the 1977 // jump. BranchRelaxation then needs to be made aware of the newly inserted 1978 // block. 1979 // 1980 // If a spill is needed for the pc register pair, we need to insert a spill 1981 // restore block right before the destination block, and insert a short branch 1982 // into the old destination block's fallthrough predecessor. 1983 // e.g.: 1984 // 1985 // s_cbranch_scc0 skip_long_branch: 1986 // 1987 // long_branch_bb: 1988 // spill s[8:9] 1989 // s_getpc_b64 s[8:9] 1990 // s_add_u32 s8, s8, restore_bb 1991 // s_addc_u32 s9, s9, 0 1992 // s_setpc_b64 s[8:9] 1993 // 1994 // skip_long_branch: 1995 // foo; 1996 // 1997 // ..... 1998 // 1999 // dest_bb_fallthrough_predecessor: 2000 // bar; 2001 // s_branch dest_bb 2002 // 2003 // restore_bb: 2004 // restore s[8:9] 2005 // fallthrough dest_bb 2006 /// 2007 // dest_bb: 2008 // buzz; 2009 2010 RS->enterBasicBlockEnd(MBB); 2011 unsigned Scav = RS->scavengeRegisterBackwards( 2012 AMDGPU::SReg_64RegClass, 2013 MachineBasicBlock::iterator(GetPC), false, 0); 2014 MRI.replaceRegWith(PCReg, Scav); 2015 MRI.clearVirtRegs(); 2016 RS->setRegUsed(Scav); 2017 2018 return 4 + 8 + 4 + 4; 2019 } 2020 2021 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 2022 switch (Cond) { 2023 case SIInstrInfo::SCC_TRUE: 2024 return AMDGPU::S_CBRANCH_SCC1; 2025 case SIInstrInfo::SCC_FALSE: 2026 return AMDGPU::S_CBRANCH_SCC0; 2027 case SIInstrInfo::VCCNZ: 2028 return AMDGPU::S_CBRANCH_VCCNZ; 2029 case SIInstrInfo::VCCZ: 2030 return AMDGPU::S_CBRANCH_VCCZ; 2031 case SIInstrInfo::EXECNZ: 2032 return AMDGPU::S_CBRANCH_EXECNZ; 2033 case SIInstrInfo::EXECZ: 2034 return AMDGPU::S_CBRANCH_EXECZ; 2035 default: 2036 
llvm_unreachable("invalid branch predicate"); 2037 } 2038 } 2039 2040 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 2041 switch (Opcode) { 2042 case AMDGPU::S_CBRANCH_SCC0: 2043 return SCC_FALSE; 2044 case AMDGPU::S_CBRANCH_SCC1: 2045 return SCC_TRUE; 2046 case AMDGPU::S_CBRANCH_VCCNZ: 2047 return VCCNZ; 2048 case AMDGPU::S_CBRANCH_VCCZ: 2049 return VCCZ; 2050 case AMDGPU::S_CBRANCH_EXECNZ: 2051 return EXECNZ; 2052 case AMDGPU::S_CBRANCH_EXECZ: 2053 return EXECZ; 2054 default: 2055 return INVALID_BR; 2056 } 2057 } 2058 2059 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 2060 MachineBasicBlock::iterator I, 2061 MachineBasicBlock *&TBB, 2062 MachineBasicBlock *&FBB, 2063 SmallVectorImpl<MachineOperand> &Cond, 2064 bool AllowModify) const { 2065 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2066 // Unconditional Branch 2067 TBB = I->getOperand(0).getMBB(); 2068 return false; 2069 } 2070 2071 MachineBasicBlock *CondBB = nullptr; 2072 2073 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 2074 CondBB = I->getOperand(1).getMBB(); 2075 Cond.push_back(I->getOperand(0)); 2076 } else { 2077 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 2078 if (Pred == INVALID_BR) 2079 return true; 2080 2081 CondBB = I->getOperand(0).getMBB(); 2082 Cond.push_back(MachineOperand::CreateImm(Pred)); 2083 Cond.push_back(I->getOperand(1)); // Save the branch register. 2084 } 2085 ++I; 2086 2087 if (I == MBB.end()) { 2088 // Conditional branch followed by fall-through. 
2089 TBB = CondBB; 2090 return false; 2091 } 2092 2093 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2094 TBB = CondBB; 2095 FBB = I->getOperand(0).getMBB(); 2096 return false; 2097 } 2098 2099 return true; 2100 } 2101 2102 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 2103 MachineBasicBlock *&FBB, 2104 SmallVectorImpl<MachineOperand> &Cond, 2105 bool AllowModify) const { 2106 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2107 auto E = MBB.end(); 2108 if (I == E) 2109 return false; 2110 2111 // Skip over the instructions that are artificially terminators for special 2112 // exec management. 2113 while (I != E && !I->isBranch() && !I->isReturn() && 2114 I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { 2115 switch (I->getOpcode()) { 2116 case AMDGPU::SI_MASK_BRANCH: 2117 case AMDGPU::S_MOV_B64_term: 2118 case AMDGPU::S_XOR_B64_term: 2119 case AMDGPU::S_ANDN2_B64_term: 2120 case AMDGPU::S_MOV_B32_term: 2121 case AMDGPU::S_XOR_B32_term: 2122 case AMDGPU::S_OR_B32_term: 2123 case AMDGPU::S_ANDN2_B32_term: 2124 break; 2125 case AMDGPU::SI_IF: 2126 case AMDGPU::SI_ELSE: 2127 case AMDGPU::SI_KILL_I1_TERMINATOR: 2128 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 2129 // FIXME: It's messy that these need to be considered here at all. 2130 return true; 2131 default: 2132 llvm_unreachable("unexpected non-branch terminator inst"); 2133 } 2134 2135 ++I; 2136 } 2137 2138 if (I == E) 2139 return false; 2140 2141 if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) 2142 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 2143 2144 ++I; 2145 2146 // TODO: Should be able to treat as fallthrough? 2147 if (I == MBB.end()) 2148 return true; 2149 2150 if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) 2151 return true; 2152 2153 MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); 2154 2155 // Specifically handle the case where the conditional branch is to the same 2156 // destination as the mask branch. e.g. 
2157 // 2158 // si_mask_branch BB8 2159 // s_cbranch_execz BB8 2160 // s_cbranch BB9 2161 // 2162 // This is required to understand divergent loops which may need the branches 2163 // to be relaxed. 2164 if (TBB != MaskBrDest || Cond.empty()) 2165 return true; 2166 2167 auto Pred = Cond[0].getImm(); 2168 return (Pred != EXECZ && Pred != EXECNZ); 2169 } 2170 2171 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 2172 int *BytesRemoved) const { 2173 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2174 2175 unsigned Count = 0; 2176 unsigned RemovedSize = 0; 2177 while (I != MBB.end()) { 2178 MachineBasicBlock::iterator Next = std::next(I); 2179 if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { 2180 I = Next; 2181 continue; 2182 } 2183 2184 RemovedSize += getInstSizeInBytes(*I); 2185 I->eraseFromParent(); 2186 ++Count; 2187 I = Next; 2188 } 2189 2190 if (BytesRemoved) 2191 *BytesRemoved = RemovedSize; 2192 2193 return Count; 2194 } 2195 2196 // Copy the flags onto the implicit condition register operand. 
2197 static void preserveCondRegFlags(MachineOperand &CondReg, 2198 const MachineOperand &OrigCond) { 2199 CondReg.setIsUndef(OrigCond.isUndef()); 2200 CondReg.setIsKill(OrigCond.isKill()); 2201 } 2202 2203 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 2204 MachineBasicBlock *TBB, 2205 MachineBasicBlock *FBB, 2206 ArrayRef<MachineOperand> Cond, 2207 const DebugLoc &DL, 2208 int *BytesAdded) const { 2209 if (!FBB && Cond.empty()) { 2210 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2211 .addMBB(TBB); 2212 if (BytesAdded) 2213 *BytesAdded = 4; 2214 return 1; 2215 } 2216 2217 if(Cond.size() == 1 && Cond[0].isReg()) { 2218 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 2219 .add(Cond[0]) 2220 .addMBB(TBB); 2221 return 1; 2222 } 2223 2224 assert(TBB && Cond[0].isImm()); 2225 2226 unsigned Opcode 2227 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 2228 2229 if (!FBB) { 2230 Cond[1].isUndef(); 2231 MachineInstr *CondBr = 2232 BuildMI(&MBB, DL, get(Opcode)) 2233 .addMBB(TBB); 2234 2235 // Copy the flags onto the implicit condition register operand. 
2236 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 2237 2238 if (BytesAdded) 2239 *BytesAdded = 4; 2240 return 1; 2241 } 2242 2243 assert(TBB && FBB); 2244 2245 MachineInstr *CondBr = 2246 BuildMI(&MBB, DL, get(Opcode)) 2247 .addMBB(TBB); 2248 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2249 .addMBB(FBB); 2250 2251 MachineOperand &CondReg = CondBr->getOperand(1); 2252 CondReg.setIsUndef(Cond[1].isUndef()); 2253 CondReg.setIsKill(Cond[1].isKill()); 2254 2255 if (BytesAdded) 2256 *BytesAdded = 8; 2257 2258 return 2; 2259 } 2260 2261 bool SIInstrInfo::reverseBranchCondition( 2262 SmallVectorImpl<MachineOperand> &Cond) const { 2263 if (Cond.size() != 2) { 2264 return true; 2265 } 2266 2267 if (Cond[0].isImm()) { 2268 Cond[0].setImm(-Cond[0].getImm()); 2269 return false; 2270 } 2271 2272 return true; 2273 } 2274 2275 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2276 ArrayRef<MachineOperand> Cond, 2277 Register DstReg, Register TrueReg, 2278 Register FalseReg, int &CondCycles, 2279 int &TrueCycles, int &FalseCycles) const { 2280 switch (Cond[0].getImm()) { 2281 case VCCNZ: 2282 case VCCZ: { 2283 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2284 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2285 assert(MRI.getRegClass(FalseReg) == RC); 2286 2287 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2288 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2289 2290 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2291 return RI.hasVGPRs(RC) && NumInsts <= 6; 2292 } 2293 case SCC_TRUE: 2294 case SCC_FALSE: { 2295 // FIXME: We could insert for VGPRs if we could replace the original compare 2296 // with a vector one. 
2297 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2298 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2299 assert(MRI.getRegClass(FalseReg) == RC); 2300 2301 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2302 2303 // Multiples of 8 can do s_cselect_b64 2304 if (NumInsts % 2 == 0) 2305 NumInsts /= 2; 2306 2307 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2308 return RI.isSGPRClass(RC); 2309 } 2310 default: 2311 return false; 2312 } 2313 } 2314 2315 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2316 MachineBasicBlock::iterator I, const DebugLoc &DL, 2317 Register DstReg, ArrayRef<MachineOperand> Cond, 2318 Register TrueReg, Register FalseReg) const { 2319 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2320 if (Pred == VCCZ || Pred == SCC_FALSE) { 2321 Pred = static_cast<BranchPredicate>(-Pred); 2322 std::swap(TrueReg, FalseReg); 2323 } 2324 2325 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2326 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2327 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2328 2329 if (DstSize == 32) { 2330 unsigned SelOp = Pred == SCC_TRUE ? 2331 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 2332 2333 // Instruction's operands are backwards from what is expected. 
2334 MachineInstr *Select = 2335 BuildMI(MBB, I, DL, get(SelOp), DstReg) 2336 .addReg(FalseReg) 2337 .addReg(TrueReg); 2338 2339 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2340 return; 2341 } 2342 2343 if (DstSize == 64 && Pred == SCC_TRUE) { 2344 MachineInstr *Select = 2345 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2346 .addReg(FalseReg) 2347 .addReg(TrueReg); 2348 2349 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2350 return; 2351 } 2352 2353 static const int16_t Sub0_15[] = { 2354 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2355 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2356 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2357 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2358 }; 2359 2360 static const int16_t Sub0_15_64[] = { 2361 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2362 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2363 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2364 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2365 }; 2366 2367 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2368 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2369 const int16_t *SubIndices = Sub0_15; 2370 int NElts = DstSize / 32; 2371 2372 // 64-bit select is only available for SALU. 2373 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 
2374 if (Pred == SCC_TRUE) { 2375 if (NElts % 2) { 2376 SelOp = AMDGPU::S_CSELECT_B32; 2377 EltRC = &AMDGPU::SGPR_32RegClass; 2378 } else { 2379 SelOp = AMDGPU::S_CSELECT_B64; 2380 EltRC = &AMDGPU::SGPR_64RegClass; 2381 SubIndices = Sub0_15_64; 2382 NElts /= 2; 2383 } 2384 } 2385 2386 MachineInstrBuilder MIB = BuildMI( 2387 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 2388 2389 I = MIB->getIterator(); 2390 2391 SmallVector<Register, 8> Regs; 2392 for (int Idx = 0; Idx != NElts; ++Idx) { 2393 Register DstElt = MRI.createVirtualRegister(EltRC); 2394 Regs.push_back(DstElt); 2395 2396 unsigned SubIdx = SubIndices[Idx]; 2397 2398 MachineInstr *Select = 2399 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2400 .addReg(FalseReg, 0, SubIdx) 2401 .addReg(TrueReg, 0, SubIdx); 2402 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2403 fixImplicitOperands(*Select); 2404 2405 MIB.addReg(DstElt) 2406 .addImm(SubIdx); 2407 } 2408 } 2409 2410 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 2411 switch (MI.getOpcode()) { 2412 case AMDGPU::V_MOV_B32_e32: 2413 case AMDGPU::V_MOV_B32_e64: 2414 case AMDGPU::V_MOV_B64_PSEUDO: { 2415 // If there are additional implicit register operands, this may be used for 2416 // register indexing so the source register operand isn't simply copied. 
    unsigned NumOps = MI.getDesc().getNumOperands() +
      MI.getDesc().getNumImplicitUses();

    return MI.getNumOperands() == NumOps;
  }
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::COPY:
  case AMDGPU::V_ACCVGPR_WRITE_B32:
  case AMDGPU::V_ACCVGPR_READ_B32:
    return true;
  default:
    return false;
  }
}

/// Map a PseudoSourceValue kind to the AMDGPU address space it accesses:
/// stack slots live in private (scratch) memory, constant-like pseudo
/// sources in constant memory; anything else defaults to flat.
unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
    unsigned Kind) const {
  switch(Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

/// Strip the src0/src1/src2 source-modifier operands from \p MI; used below
/// when rewriting a mad/fma into a madmk/madak form that cannot encode them.
static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  // Remove the highest index first so the remaining indices stay valid.
  MI.RemoveOperand(Src2ModIdx);
  MI.RemoveOperand(Src1ModIdx);
  MI.RemoveOperand(Src0ModIdx);
}

/// Try to fold the immediate defined by \p DefMI into its single non-debug
/// use \p UseMI (a COPY or a mad/fma-like instruction), rewriting \p UseMI
/// in place. Returns true on success and may erase \p DefMI.
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                Register Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  switch (DefMI.getOpcode()) {
  default:
    return false;
  case AMDGPU::S_MOV_B64:
    // TODO: We could fold 64-bit immediates, but this gets complicated
    // when there are sub-registers.
2475 return false; 2476 2477 case AMDGPU::V_MOV_B32_e32: 2478 case AMDGPU::S_MOV_B32: 2479 case AMDGPU::V_ACCVGPR_WRITE_B32: 2480 break; 2481 } 2482 2483 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 2484 assert(ImmOp); 2485 // FIXME: We could handle FrameIndex values here. 2486 if (!ImmOp->isImm()) 2487 return false; 2488 2489 unsigned Opc = UseMI.getOpcode(); 2490 if (Opc == AMDGPU::COPY) { 2491 bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); 2492 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2493 if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { 2494 if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) 2495 return false; 2496 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; 2497 } 2498 UseMI.setDesc(get(NewOpc)); 2499 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 2500 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2501 return true; 2502 } 2503 2504 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2505 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || 2506 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2507 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { 2508 // Don't fold if we are using source or output modifiers. The new VOP2 2509 // instructions don't have them. 2510 if (hasAnyModifiersSet(UseMI)) 2511 return false; 2512 2513 // If this is a free constant, there's no reason to do this. 2514 // TODO: We could fold this here instead of letting SIFoldOperands do it 2515 // later. 2516 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2517 2518 // Any src operand can be used for the legality check. 
2519 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2520 return false; 2521 2522 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2523 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; 2524 bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2525 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; 2526 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2527 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2528 2529 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 2530 // We should only expect these to be on src0 due to canonicalizations. 2531 if (Src0->isReg() && Src0->getReg() == Reg) { 2532 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2533 return false; 2534 2535 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2536 return false; 2537 2538 unsigned NewOpc = 2539 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) 2540 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 2541 if (pseudoToMCOpcode(NewOpc) == -1) 2542 return false; 2543 2544 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 2545 2546 const int64_t Imm = ImmOp->getImm(); 2547 2548 // FIXME: This would be a lot easier if we could return a new instruction 2549 // instead of having to modify in place. 2550 2551 // Remove these first since they are at the end. 
2552 UseMI.RemoveOperand( 2553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2554 UseMI.RemoveOperand( 2555 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2556 2557 Register Src1Reg = Src1->getReg(); 2558 unsigned Src1SubReg = Src1->getSubReg(); 2559 Src0->setReg(Src1Reg); 2560 Src0->setSubReg(Src1SubReg); 2561 Src0->setIsKill(Src1->isKill()); 2562 2563 if (Opc == AMDGPU::V_MAC_F32_e64 || 2564 Opc == AMDGPU::V_MAC_F16_e64 || 2565 Opc == AMDGPU::V_FMAC_F32_e64 || 2566 Opc == AMDGPU::V_FMAC_F16_e64) 2567 UseMI.untieRegOperand( 2568 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2569 2570 Src1->ChangeToImmediate(Imm); 2571 2572 removeModOperands(UseMI); 2573 UseMI.setDesc(get(NewOpc)); 2574 2575 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2576 if (DeleteDef) 2577 DefMI.eraseFromParent(); 2578 2579 return true; 2580 } 2581 2582 // Added part is the constant: Use v_madak_{f16, f32}. 2583 if (Src2->isReg() && Src2->getReg() == Reg) { 2584 // Not allowed to use constant bus for another operand. 2585 // We can however allow an inline immediate as src0. 2586 bool Src0Inlined = false; 2587 if (Src0->isReg()) { 2588 // Try to inline constant if possible. 2589 // If the Def moves immediate and the use is single 2590 // We are saving VGPR here. 
2591 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2592 if (Def && Def->isMoveImmediate() && 2593 isInlineConstant(Def->getOperand(1)) && 2594 MRI->hasOneUse(Src0->getReg())) { 2595 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2596 Src0Inlined = true; 2597 } else if ((Register::isPhysicalRegister(Src0->getReg()) && 2598 (ST.getConstantBusLimit(Opc) <= 1 && 2599 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || 2600 (Register::isVirtualRegister(Src0->getReg()) && 2601 (ST.getConstantBusLimit(Opc) <= 1 && 2602 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 2603 return false; 2604 // VGPR is okay as Src0 - fallthrough 2605 } 2606 2607 if (Src1->isReg() && !Src0Inlined ) { 2608 // We have one slot for inlinable constant so far - try to fill it 2609 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2610 if (Def && Def->isMoveImmediate() && 2611 isInlineConstant(Def->getOperand(1)) && 2612 MRI->hasOneUse(Src1->getReg()) && 2613 commuteInstruction(UseMI)) { 2614 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2615 } else if ((Register::isPhysicalRegister(Src1->getReg()) && 2616 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2617 (Register::isVirtualRegister(Src1->getReg()) && 2618 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2619 return false; 2620 // VGPR is okay as Src1 - fallthrough 2621 } 2622 2623 unsigned NewOpc = 2624 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) 2625 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 2626 if (pseudoToMCOpcode(NewOpc) == -1) 2627 return false; 2628 2629 const int64_t Imm = ImmOp->getImm(); 2630 2631 // FIXME: This would be a lot easier if we could return a new instruction 2632 // instead of having to modify in place. 2633 2634 // Remove these first since they are at the end. 
2635 UseMI.RemoveOperand( 2636 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2637 UseMI.RemoveOperand( 2638 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2639 2640 if (Opc == AMDGPU::V_MAC_F32_e64 || 2641 Opc == AMDGPU::V_MAC_F16_e64 || 2642 Opc == AMDGPU::V_FMAC_F32_e64 || 2643 Opc == AMDGPU::V_FMAC_F16_e64) 2644 UseMI.untieRegOperand( 2645 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2646 2647 // ChangingToImmediate adds Src2 back to the instruction. 2648 Src2->ChangeToImmediate(Imm); 2649 2650 // These come before src2. 2651 removeModOperands(UseMI); 2652 UseMI.setDesc(get(NewOpc)); 2653 // It might happen that UseMI was commuted 2654 // and we now have SGPR as SRC1. If so 2 inlined 2655 // constant and SGPR are illegal. 2656 legalizeOperands(UseMI); 2657 2658 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2659 if (DeleteDef) 2660 DefMI.eraseFromParent(); 2661 2662 return true; 2663 } 2664 } 2665 2666 return false; 2667 } 2668 2669 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 2670 int WidthB, int OffsetB) { 2671 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 2672 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 2673 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 2674 return LowOffset + LowWidth <= HighOffset; 2675 } 2676 2677 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 2678 const MachineInstr &MIb) const { 2679 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; 2680 int64_t Offset0, Offset1; 2681 bool Offset0IsScalable, Offset1IsScalable; 2682 if (!getMemOperandsWithOffset(MIa, BaseOps0, Offset0, Offset0IsScalable, &RI) || 2683 !getMemOperandsWithOffset(MIb, BaseOps1, Offset1, Offset1IsScalable, &RI)) 2684 return false; 2685 2686 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) 2687 return false; 2688 2689 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 2690 // FIXME: Handle ds_read2 / ds_write2. 
2691 return false; 2692 } 2693 unsigned Width0 = MIa.memoperands().front()->getSize(); 2694 unsigned Width1 = MIb.memoperands().front()->getSize(); 2695 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); 2696 } 2697 2698 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 2699 const MachineInstr &MIb) const { 2700 assert(MIa.mayLoadOrStore() && 2701 "MIa must load from or modify a memory location"); 2702 assert(MIb.mayLoadOrStore() && 2703 "MIb must load from or modify a memory location"); 2704 2705 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 2706 return false; 2707 2708 // XXX - Can we relax this between address spaces? 2709 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 2710 return false; 2711 2712 // TODO: Should we check the address space from the MachineMemOperand? That 2713 // would allow us to distinguish objects we know don't alias based on the 2714 // underlying address space, even if it was lowered to a different one, 2715 // e.g. private accesses lowered to use MUBUF instructions on a scratch 2716 // buffer. 
2717 if (isDS(MIa)) { 2718 if (isDS(MIb)) 2719 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2720 2721 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2722 } 2723 2724 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2725 if (isMUBUF(MIb) || isMTBUF(MIb)) 2726 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2727 2728 return !isFLAT(MIb) && !isSMRD(MIb); 2729 } 2730 2731 if (isSMRD(MIa)) { 2732 if (isSMRD(MIb)) 2733 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2734 2735 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 2736 } 2737 2738 if (isFLAT(MIa)) { 2739 if (isFLAT(MIb)) 2740 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2741 2742 return false; 2743 } 2744 2745 return false; 2746 } 2747 2748 static int64_t getFoldableImm(const MachineOperand* MO) { 2749 if (!MO->isReg()) 2750 return false; 2751 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2752 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2753 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2754 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2755 Def->getOperand(1).isImm()) 2756 return Def->getOperand(1).getImm(); 2757 return AMDGPU::NoRegister; 2758 } 2759 2760 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2761 MachineInstr &MI, 2762 LiveVariables *LV) const { 2763 unsigned Opc = MI.getOpcode(); 2764 bool IsF16 = false; 2765 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2766 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; 2767 2768 switch (Opc) { 2769 default: 2770 return nullptr; 2771 case AMDGPU::V_MAC_F16_e64: 2772 case AMDGPU::V_FMAC_F16_e64: 2773 IsF16 = true; 2774 LLVM_FALLTHROUGH; 2775 case AMDGPU::V_MAC_F32_e64: 2776 case AMDGPU::V_FMAC_F32_e64: 2777 break; 2778 case AMDGPU::V_MAC_F16_e32: 2779 case AMDGPU::V_FMAC_F16_e32: 2780 IsF16 = true; 2781 LLVM_FALLTHROUGH; 2782 case AMDGPU::V_MAC_F32_e32: 2783 case AMDGPU::V_FMAC_F32_e32: { 2784 int Src0Idx = 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2785 AMDGPU::OpName::src0); 2786 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2787 if (!Src0->isReg() && !Src0->isImm()) 2788 return nullptr; 2789 2790 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2791 return nullptr; 2792 2793 break; 2794 } 2795 } 2796 2797 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2798 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2799 const MachineOperand *Src0Mods = 2800 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2801 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2802 const MachineOperand *Src1Mods = 2803 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2804 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2805 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2806 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2807 2808 if (!Src0Mods && !Src1Mods && !Clamp && !Omod && 2809 // If we have an SGPR input, we will violate the constant bus restriction. 2810 (ST.getConstantBusLimit(Opc) > 1 || 2811 !Src0->isReg() || 2812 !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 2813 if (auto Imm = getFoldableImm(Src2)) { 2814 unsigned NewOpc = 2815 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) 2816 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 2817 if (pseudoToMCOpcode(NewOpc) != -1) 2818 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2819 .add(*Dst) 2820 .add(*Src0) 2821 .add(*Src1) 2822 .addImm(Imm); 2823 } 2824 unsigned NewOpc = 2825 IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) 2826 : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 2827 if (auto Imm = getFoldableImm(Src1)) { 2828 if (pseudoToMCOpcode(NewOpc) != -1) 2829 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2830 .add(*Dst) 2831 .add(*Src0) 2832 .addImm(Imm) 2833 .add(*Src2); 2834 } 2835 if (auto Imm = getFoldableImm(Src0)) { 2836 if (pseudoToMCOpcode(NewOpc) != -1 && 2837 isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, 2838 AMDGPU::OpName::src0), Src1)) 2839 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2840 .add(*Dst) 2841 .add(*Src1) 2842 .addImm(Imm) 2843 .add(*Src2); 2844 } 2845 } 2846 2847 unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) 2848 : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); 2849 if (pseudoToMCOpcode(NewOpc) == -1) 2850 return nullptr; 2851 2852 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2853 .add(*Dst) 2854 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2855 .add(*Src0) 2856 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2857 .add(*Src1) 2858 .addImm(0) // Src mods 2859 .add(*Src2) 2860 .addImm(Clamp ? Clamp->getImm() : 0) 2861 .addImm(Omod ? Omod->getImm() : 0); 2862 } 2863 2864 // It's not generally safe to move VALU instructions across these since it will 2865 // start using the register as a base index rather than directly. 2866 // XXX - Why isn't hasSideEffects sufficient for these? 2867 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 2868 switch (MI.getOpcode()) { 2869 case AMDGPU::S_SET_GPR_IDX_ON: 2870 case AMDGPU::S_SET_GPR_IDX_MODE: 2871 case AMDGPU::S_SET_GPR_IDX_OFF: 2872 return true; 2873 default: 2874 return false; 2875 } 2876 } 2877 2878 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 2879 const MachineBasicBlock *MBB, 2880 const MachineFunction &MF) const { 2881 // XXX - Do we want the SP check in the base implementation? 2882 2883 // Target-independent instructions do not have an implicit-use of EXEC, even 2884 // when they operate on VGPRs. 
Treating EXEC modifications as scheduling 2885 // boundaries prevents incorrect movements of such instructions. 2886 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 2887 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 2888 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 2889 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 2890 MI.getOpcode() == AMDGPU::S_DENORM_MODE || 2891 changesVGPRIndexingMode(MI); 2892 } 2893 2894 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 2895 return Opcode == AMDGPU::DS_ORDERED_COUNT || 2896 Opcode == AMDGPU::DS_GWS_INIT || 2897 Opcode == AMDGPU::DS_GWS_SEMA_V || 2898 Opcode == AMDGPU::DS_GWS_SEMA_BR || 2899 Opcode == AMDGPU::DS_GWS_SEMA_P || 2900 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 2901 Opcode == AMDGPU::DS_GWS_BARRIER; 2902 } 2903 2904 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 2905 unsigned Opcode = MI.getOpcode(); 2906 2907 if (MI.mayStore() && isSMRD(MI)) 2908 return true; // scalar store or atomic 2909 2910 // This will terminate the function when other lanes may need to continue. 2911 if (MI.isReturn()) 2912 return true; 2913 2914 // These instructions cause shader I/O that may cause hardware lockups 2915 // when executed with an empty EXEC mask. 2916 // 2917 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 2918 // EXEC = 0, but checking for that case here seems not worth it 2919 // given the typical code patterns. 2920 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 2921 Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || 2922 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 2923 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 2924 return true; 2925 2926 if (MI.isCall() || MI.isInlineAsm()) 2927 return true; // conservative assumption 2928 2929 // These are like SALU instructions in terms of effects, so it's questionable 2930 // whether we should return true for those. 
2931 // 2932 // However, executing them with EXEC = 0 causes them to operate on undefined 2933 // data, which we avoid by returning true here. 2934 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32) 2935 return true; 2936 2937 return false; 2938 } 2939 2940 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 2941 const MachineInstr &MI) const { 2942 if (MI.isMetaInstruction()) 2943 return false; 2944 2945 // This won't read exec if this is an SGPR->SGPR copy. 2946 if (MI.isCopyLike()) { 2947 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) 2948 return true; 2949 2950 // Make sure this isn't copying exec as a normal operand 2951 return MI.readsRegister(AMDGPU::EXEC, &RI); 2952 } 2953 2954 // Make a conservative assumption about the callee. 2955 if (MI.isCall()) 2956 return true; 2957 2958 // Be conservative with any unhandled generic opcodes. 2959 if (!isTargetSpecificOpcode(MI.getOpcode())) 2960 return true; 2961 2962 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); 2963 } 2964 2965 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 2966 switch (Imm.getBitWidth()) { 2967 case 1: // This likely will be a condition code mask. 
2968 return true; 2969 2970 case 32: 2971 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 2972 ST.hasInv2PiInlineImm()); 2973 case 64: 2974 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 2975 ST.hasInv2PiInlineImm()); 2976 case 16: 2977 return ST.has16BitInsts() && 2978 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 2979 ST.hasInv2PiInlineImm()); 2980 default: 2981 llvm_unreachable("invalid bitwidth"); 2982 } 2983 } 2984 2985 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 2986 uint8_t OperandType) const { 2987 if (!MO.isImm() || 2988 OperandType < AMDGPU::OPERAND_SRC_FIRST || 2989 OperandType > AMDGPU::OPERAND_SRC_LAST) 2990 return false; 2991 2992 // MachineOperand provides no way to tell the true operand size, since it only 2993 // records a 64-bit value. We need to know the size to determine if a 32-bit 2994 // floating point immediate bit pattern is legal for an integer immediate. It 2995 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
2996 2997 int64_t Imm = MO.getImm(); 2998 switch (OperandType) { 2999 case AMDGPU::OPERAND_REG_IMM_INT32: 3000 case AMDGPU::OPERAND_REG_IMM_FP32: 3001 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3002 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3003 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3004 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { 3005 int32_t Trunc = static_cast<int32_t>(Imm); 3006 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 3007 } 3008 case AMDGPU::OPERAND_REG_IMM_INT64: 3009 case AMDGPU::OPERAND_REG_IMM_FP64: 3010 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3011 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3012 return AMDGPU::isInlinableLiteral64(MO.getImm(), 3013 ST.hasInv2PiInlineImm()); 3014 case AMDGPU::OPERAND_REG_IMM_INT16: 3015 case AMDGPU::OPERAND_REG_IMM_FP16: 3016 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3017 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3018 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3019 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3020 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 3021 // A few special case instructions have 16-bit operands on subtargets 3022 // where 16-bit instructions are not legal. 3023 // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle 3024 // constants in these cases 3025 int16_t Trunc = static_cast<int16_t>(Imm); 3026 return ST.has16BitInsts() && 3027 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 3028 } 3029 3030 return false; 3031 } 3032 case AMDGPU::OPERAND_REG_IMM_V2INT16: 3033 case AMDGPU::OPERAND_REG_IMM_V2FP16: 3034 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 3035 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 3036 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: 3037 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 3038 uint32_t Trunc = static_cast<uint32_t>(Imm); 3039 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 3040 } 3041 default: 3042 llvm_unreachable("invalid bitwidth"); 3043 } 3044 } 3045 3046 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 3047 const MCOperandInfo &OpInfo) const { 3048 switch (MO.getType()) { 3049 case MachineOperand::MO_Register: 3050 return false; 3051 case MachineOperand::MO_Immediate: 3052 return !isInlineConstant(MO, OpInfo); 3053 case MachineOperand::MO_FrameIndex: 3054 case MachineOperand::MO_MachineBasicBlock: 3055 case MachineOperand::MO_ExternalSymbol: 3056 case MachineOperand::MO_GlobalAddress: 3057 case MachineOperand::MO_MCSymbol: 3058 return true; 3059 default: 3060 llvm_unreachable("unexpected operand type"); 3061 } 3062 } 3063 3064 static bool compareMachineOp(const MachineOperand &Op0, 3065 const MachineOperand &Op1) { 3066 if (Op0.getType() != Op1.getType()) 3067 return false; 3068 3069 switch (Op0.getType()) { 3070 case MachineOperand::MO_Register: 3071 return Op0.getReg() == Op1.getReg(); 3072 case MachineOperand::MO_Immediate: 3073 return Op0.getImm() == Op1.getImm(); 3074 default: 3075 llvm_unreachable("Didn't expect to be comparing these operand types"); 3076 } 3077 } 3078 3079 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 3080 const MachineOperand &MO) const { 3081 const MCInstrDesc &InstDesc = MI.getDesc(); 3082 const MCOperandInfo 
&OpInfo = InstDesc.OpInfo[OpNo]; 3083 3084 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3085 3086 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3087 return true; 3088 3089 if (OpInfo.RegClass < 0) 3090 return false; 3091 3092 const MachineFunction *MF = MI.getParent()->getParent(); 3093 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3094 3095 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3096 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3097 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3098 AMDGPU::OpName::src2)) 3099 return false; 3100 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3101 } 3102 3103 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3104 return false; 3105 3106 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3107 return true; 3108 3109 return ST.hasVOP3Literal(); 3110 } 3111 3112 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3113 int Op32 = AMDGPU::getVOPe32(Opcode); 3114 if (Op32 == -1) 3115 return false; 3116 3117 return pseudoToMCOpcode(Op32) != -1; 3118 } 3119 3120 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3121 // The src0_modifier operand is present on all instructions 3122 // that have modifiers. 
3123 3124 return AMDGPU::getNamedOperandIdx(Opcode, 3125 AMDGPU::OpName::src0_modifiers) != -1; 3126 } 3127 3128 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 3129 unsigned OpName) const { 3130 const MachineOperand *Mods = getNamedOperand(MI, OpName); 3131 return Mods && Mods->getImm(); 3132 } 3133 3134 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 3135 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 3136 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 3137 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 3138 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 3139 hasModifiersSet(MI, AMDGPU::OpName::omod); 3140 } 3141 3142 bool SIInstrInfo::canShrink(const MachineInstr &MI, 3143 const MachineRegisterInfo &MRI) const { 3144 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3145 // Can't shrink instruction with three operands. 3146 // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add 3147 // a special case for it. It can only be shrunk if the third operand 3148 // is vcc, and src0_modifiers and src1_modifiers are not set. 3149 // We should handle this the same way we handle vopc, by addding 3150 // a register allocation hint pre-regalloc and then do the shrinking 3151 // post-regalloc. 3152 if (Src2) { 3153 switch (MI.getOpcode()) { 3154 default: return false; 3155 3156 case AMDGPU::V_ADDC_U32_e64: 3157 case AMDGPU::V_SUBB_U32_e64: 3158 case AMDGPU::V_SUBBREV_U32_e64: { 3159 const MachineOperand *Src1 3160 = getNamedOperand(MI, AMDGPU::OpName::src1); 3161 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 3162 return false; 3163 // Additional verification is needed for sdst/src2. 
3164 return true; 3165 } 3166 case AMDGPU::V_MAC_F32_e64: 3167 case AMDGPU::V_MAC_F16_e64: 3168 case AMDGPU::V_FMAC_F32_e64: 3169 case AMDGPU::V_FMAC_F16_e64: 3170 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 3171 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 3172 return false; 3173 break; 3174 3175 case AMDGPU::V_CNDMASK_B32_e64: 3176 break; 3177 } 3178 } 3179 3180 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3181 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 3182 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 3183 return false; 3184 3185 // We don't need to check src0, all input types are legal, so just make sure 3186 // src0 isn't using any modifiers. 3187 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 3188 return false; 3189 3190 // Can it be shrunk to a valid 32 bit opcode? 3191 if (!hasVALU32BitEncoding(MI.getOpcode())) 3192 return false; 3193 3194 // Check output modifiers 3195 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 3196 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 3197 } 3198 3199 // Set VCC operand with all flags from \p Orig, except for setting it as 3200 // implicit. 3201 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3202 const MachineOperand &Orig) { 3203 3204 for (MachineOperand &Use : MI.implicit_operands()) { 3205 if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { 3206 Use.setIsUndef(Orig.isUndef()); 3207 Use.setIsKill(Orig.isKill()); 3208 return; 3209 } 3210 } 3211 } 3212 3213 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3214 unsigned Op32) const { 3215 MachineBasicBlock *MBB = MI.getParent();; 3216 MachineInstrBuilder Inst32 = 3217 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); 3218 3219 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3220 // For VOPC instructions, this is replaced by an implicit def of vcc. 
3221 int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); 3222 if (Op32DstIdx != -1) { 3223 // dst 3224 Inst32.add(MI.getOperand(0)); 3225 } else { 3226 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 3227 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 3228 "Unexpected case"); 3229 } 3230 3231 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 3232 3233 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3234 if (Src1) 3235 Inst32.add(*Src1); 3236 3237 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3238 3239 if (Src2) { 3240 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 3241 if (Op32Src2Idx != -1) { 3242 Inst32.add(*Src2); 3243 } else { 3244 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 3245 // replaced with an implicit read of vcc. This was already added 3246 // during the initial BuildMI, so find it to preserve the flags. 3247 copyFlagsToImplicitVCC(*Inst32, *Src2); 3248 } 3249 } 3250 3251 return Inst32; 3252 } 3253 3254 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 3255 const MachineOperand &MO, 3256 const MCOperandInfo &OpInfo) const { 3257 // Literal constants use the constant bus. 
3258 //if (isLiteralConstantLike(MO, OpInfo)) 3259 // return true; 3260 if (MO.isImm()) 3261 return !isInlineConstant(MO, OpInfo); 3262 3263 if (!MO.isReg()) 3264 return true; // Misc other operands like FrameIndex 3265 3266 if (!MO.isUse()) 3267 return false; 3268 3269 if (Register::isVirtualRegister(MO.getReg())) 3270 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 3271 3272 // Null is free 3273 if (MO.getReg() == AMDGPU::SGPR_NULL) 3274 return false; 3275 3276 // SGPRs use the constant bus 3277 if (MO.isImplicit()) { 3278 return MO.getReg() == AMDGPU::M0 || 3279 MO.getReg() == AMDGPU::VCC || 3280 MO.getReg() == AMDGPU::VCC_LO; 3281 } else { 3282 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 3283 AMDGPU::SReg_64RegClass.contains(MO.getReg()); 3284 } 3285 } 3286 3287 static Register findImplicitSGPRRead(const MachineInstr &MI) { 3288 for (const MachineOperand &MO : MI.implicit_operands()) { 3289 // We only care about reads. 3290 if (MO.isDef()) 3291 continue; 3292 3293 switch (MO.getReg()) { 3294 case AMDGPU::VCC: 3295 case AMDGPU::VCC_LO: 3296 case AMDGPU::VCC_HI: 3297 case AMDGPU::M0: 3298 case AMDGPU::FLAT_SCR: 3299 return MO.getReg(); 3300 3301 default: 3302 break; 3303 } 3304 } 3305 3306 return AMDGPU::NoRegister; 3307 } 3308 3309 static bool shouldReadExec(const MachineInstr &MI) { 3310 if (SIInstrInfo::isVALU(MI)) { 3311 switch (MI.getOpcode()) { 3312 case AMDGPU::V_READLANE_B32: 3313 case AMDGPU::V_READLANE_B32_gfx6_gfx7: 3314 case AMDGPU::V_READLANE_B32_gfx10: 3315 case AMDGPU::V_READLANE_B32_vi: 3316 case AMDGPU::V_WRITELANE_B32: 3317 case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: 3318 case AMDGPU::V_WRITELANE_B32_gfx10: 3319 case AMDGPU::V_WRITELANE_B32_vi: 3320 return false; 3321 } 3322 3323 return true; 3324 } 3325 3326 if (MI.isPreISelOpcode() || 3327 SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 3328 SIInstrInfo::isSALU(MI) || 3329 SIInstrInfo::isSMRD(MI)) 3330 return false; 3331 3332 return true; 3333 } 3334 3335 static bool 
isSubRegOf(const SIRegisterInfo &TRI, 3336 const MachineOperand &SuperVec, 3337 const MachineOperand &SubReg) { 3338 if (Register::isPhysicalRegister(SubReg.getReg())) 3339 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 3340 3341 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 3342 SubReg.getReg() == SuperVec.getReg(); 3343 } 3344 3345 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 3346 StringRef &ErrInfo) const { 3347 uint16_t Opcode = MI.getOpcode(); 3348 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 3349 return true; 3350 3351 const MachineFunction *MF = MI.getParent()->getParent(); 3352 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3353 3354 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 3355 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 3356 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 3357 3358 // Make sure the number of operands is correct. 3359 const MCInstrDesc &Desc = get(Opcode); 3360 if (!Desc.isVariadic() && 3361 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 3362 ErrInfo = "Instruction has wrong number of operands."; 3363 return false; 3364 } 3365 3366 if (MI.isInlineAsm()) { 3367 // Verify register classes for inlineasm constraints. 3368 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 3369 I != E; ++I) { 3370 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 3371 if (!RC) 3372 continue; 3373 3374 const MachineOperand &Op = MI.getOperand(I); 3375 if (!Op.isReg()) 3376 continue; 3377 3378 Register Reg = Op.getReg(); 3379 if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) { 3380 ErrInfo = "inlineasm operand has incorrect register class."; 3381 return false; 3382 } 3383 } 3384 3385 return true; 3386 } 3387 3388 // Make sure the register classes are correct. 
3389 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 3390 if (MI.getOperand(i).isFPImm()) { 3391 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 3392 "all fp values to integers."; 3393 return false; 3394 } 3395 3396 int RegClass = Desc.OpInfo[i].RegClass; 3397 3398 switch (Desc.OpInfo[i].OperandType) { 3399 case MCOI::OPERAND_REGISTER: 3400 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 3401 ErrInfo = "Illegal immediate value for operand."; 3402 return false; 3403 } 3404 break; 3405 case AMDGPU::OPERAND_REG_IMM_INT32: 3406 case AMDGPU::OPERAND_REG_IMM_FP32: 3407 break; 3408 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3409 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3410 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3411 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3412 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3413 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3414 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3415 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 3416 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3417 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3418 const MachineOperand &MO = MI.getOperand(i); 3419 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 3420 ErrInfo = "Illegal immediate value for operand."; 3421 return false; 3422 } 3423 break; 3424 } 3425 case MCOI::OPERAND_IMMEDIATE: 3426 case AMDGPU::OPERAND_KIMM32: 3427 // Check if this operand is an immediate. 3428 // FrameIndex operands will be replaced by immediates, so they are 3429 // allowed. 
3430 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 3431 ErrInfo = "Expected immediate, but got non-immediate"; 3432 return false; 3433 } 3434 LLVM_FALLTHROUGH; 3435 default: 3436 continue; 3437 } 3438 3439 if (!MI.getOperand(i).isReg()) 3440 continue; 3441 3442 if (RegClass != -1) { 3443 Register Reg = MI.getOperand(i).getReg(); 3444 if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg)) 3445 continue; 3446 3447 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 3448 if (!RC->contains(Reg)) { 3449 ErrInfo = "Operand has incorrect register class."; 3450 return false; 3451 } 3452 } 3453 } 3454 3455 // Verify SDWA 3456 if (isSDWA(MI)) { 3457 if (!ST.hasSDWA()) { 3458 ErrInfo = "SDWA is not supported on this target"; 3459 return false; 3460 } 3461 3462 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 3463 3464 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 3465 3466 for (int OpIdx: OpIndicies) { 3467 if (OpIdx == -1) 3468 continue; 3469 const MachineOperand &MO = MI.getOperand(OpIdx); 3470 3471 if (!ST.hasSDWAScalar()) { 3472 // Only VGPRS on VI 3473 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 3474 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 3475 return false; 3476 } 3477 } else { 3478 // No immediates on GFX9 3479 if (!MO.isReg()) { 3480 ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; 3481 return false; 3482 } 3483 } 3484 } 3485 3486 if (!ST.hasSDWAOmod()) { 3487 // No omod allowed on VI 3488 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3489 if (OMod != nullptr && 3490 (!OMod->isImm() || OMod->getImm() != 0)) { 3491 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 3492 return false; 3493 } 3494 } 3495 3496 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 3497 if (isVOPC(BasicOpcode)) { 3498 if (!ST.hasSDWASdst() && DstIdx != -1) { 3499 // Only vcc allowed as dst on VI for 
VOPC 3500 const MachineOperand &Dst = MI.getOperand(DstIdx); 3501 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 3502 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 3503 return false; 3504 } 3505 } else if (!ST.hasSDWAOutModsVOPC()) { 3506 // No clamp allowed on GFX9 for VOPC 3507 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3508 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 3509 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 3510 return false; 3511 } 3512 3513 // No omod allowed on GFX9 for VOPC 3514 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3515 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 3516 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 3517 return false; 3518 } 3519 } 3520 } 3521 3522 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 3523 if (DstUnused && DstUnused->isImm() && 3524 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 3525 const MachineOperand &Dst = MI.getOperand(DstIdx); 3526 if (!Dst.isReg() || !Dst.isTied()) { 3527 ErrInfo = "Dst register should have tied register"; 3528 return false; 3529 } 3530 3531 const MachineOperand &TiedMO = 3532 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 3533 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 3534 ErrInfo = 3535 "Dst register should be tied to implicit use of preserved register"; 3536 return false; 3537 } else if (Register::isPhysicalRegister(TiedMO.getReg()) && 3538 Dst.getReg() != TiedMO.getReg()) { 3539 ErrInfo = "Dst register should use same physical register as preserved"; 3540 return false; 3541 } 3542 } 3543 } 3544 3545 // Verify MIMG 3546 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 3547 // Ensure that the return type used is large enough for all the options 3548 // being used TFE/LWE require an extra result register. 
3549 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 3550 if (DMask) { 3551 uint64_t DMaskImm = DMask->getImm(); 3552 uint32_t RegCount = 3553 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 3554 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 3555 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 3556 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 3557 3558 // Adjust for packed 16 bit values 3559 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 3560 RegCount >>= 1; 3561 3562 // Adjust if using LWE or TFE 3563 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 3564 RegCount += 1; 3565 3566 const uint32_t DstIdx = 3567 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 3568 const MachineOperand &Dst = MI.getOperand(DstIdx); 3569 if (Dst.isReg()) { 3570 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 3571 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 3572 if (RegCount > DstSize) { 3573 ErrInfo = "MIMG instruction returns too many registers for dst " 3574 "register class"; 3575 return false; 3576 } 3577 } 3578 } 3579 } 3580 3581 // Verify VOP*. Ignore multiple sgpr operands on writelane. 3582 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 3583 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 3584 // Only look at the true operands. Only a real operand can use the constant 3585 // bus, and we don't want to check pseudo-operands like the source modifier 3586 // flags. 
3587 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 3588 3589 unsigned ConstantBusCount = 0; 3590 unsigned LiteralCount = 0; 3591 3592 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 3593 ++ConstantBusCount; 3594 3595 SmallVector<Register, 2> SGPRsUsed; 3596 Register SGPRUsed = findImplicitSGPRRead(MI); 3597 if (SGPRUsed != AMDGPU::NoRegister) { 3598 ++ConstantBusCount; 3599 SGPRsUsed.push_back(SGPRUsed); 3600 } 3601 3602 for (int OpIdx : OpIndices) { 3603 if (OpIdx == -1) 3604 break; 3605 const MachineOperand &MO = MI.getOperand(OpIdx); 3606 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3607 if (MO.isReg()) { 3608 SGPRUsed = MO.getReg(); 3609 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 3610 return !RI.regsOverlap(SGPRUsed, SGPR); 3611 })) { 3612 ++ConstantBusCount; 3613 SGPRsUsed.push_back(SGPRUsed); 3614 } 3615 } else { 3616 ++ConstantBusCount; 3617 ++LiteralCount; 3618 } 3619 } 3620 } 3621 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3622 // v_writelane_b32 is an exception from constant bus restriction: 3623 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 3624 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 3625 Opcode != AMDGPU::V_WRITELANE_B32) { 3626 ErrInfo = "VOP* instruction violates constant bus restriction"; 3627 return false; 3628 } 3629 3630 if (isVOP3(MI) && LiteralCount) { 3631 if (LiteralCount && !ST.hasVOP3Literal()) { 3632 ErrInfo = "VOP3 instruction uses literal"; 3633 return false; 3634 } 3635 if (LiteralCount > 1) { 3636 ErrInfo = "VOP3 instruction uses more than one literal"; 3637 return false; 3638 } 3639 } 3640 } 3641 3642 // Special case for writelane - this can break the multiple constant bus rule, 3643 // but still can't use more than one SGPR register 3644 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 3645 unsigned SGPRCount = 0; 3646 Register SGPRUsed = AMDGPU::NoRegister; 3647 3648 for (int OpIdx : {Src0Idx, Src1Idx, 
Src2Idx}) { 3649 if (OpIdx == -1) 3650 break; 3651 3652 const MachineOperand &MO = MI.getOperand(OpIdx); 3653 3654 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3655 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 3656 if (MO.getReg() != SGPRUsed) 3657 ++SGPRCount; 3658 SGPRUsed = MO.getReg(); 3659 } 3660 } 3661 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 3662 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 3663 return false; 3664 } 3665 } 3666 } 3667 3668 // Verify misc. restrictions on specific instructions. 3669 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 3670 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 3671 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3672 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3673 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 3674 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 3675 if (!compareMachineOp(Src0, Src1) && 3676 !compareMachineOp(Src0, Src2)) { 3677 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 3678 return false; 3679 } 3680 } 3681 } 3682 3683 if (isSOP2(MI) || isSOPC(MI)) { 3684 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3685 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3686 unsigned Immediates = 0; 3687 3688 if (!Src0.isReg() && 3689 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) 3690 Immediates++; 3691 if (!Src1.isReg() && 3692 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) 3693 Immediates++; 3694 3695 if (Immediates > 1) { 3696 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 3697 return false; 3698 } 3699 } 3700 3701 if (isSOPK(MI)) { 3702 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 3703 if (Desc.isBranch()) { 3704 if (!Op->isMBB()) { 3705 ErrInfo = "invalid branch target for SOPK instruction"; 3706 return false; 3707 } 3708 } else { 3709 uint64_t Imm = Op->getImm(); 3710 if (sopkIsZext(MI)) { 3711 if (!isUInt<16>(Imm)) { 3712 ErrInfo = "invalid 
immediate for SOPK instruction"; 3713 return false; 3714 } 3715 } else { 3716 if (!isInt<16>(Imm)) { 3717 ErrInfo = "invalid immediate for SOPK instruction"; 3718 return false; 3719 } 3720 } 3721 } 3722 } 3723 3724 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 3725 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 3726 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3727 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 3728 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3729 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 3730 3731 const unsigned StaticNumOps = Desc.getNumOperands() + 3732 Desc.getNumImplicitUses(); 3733 const unsigned NumImplicitOps = IsDst ? 2 : 1; 3734 3735 // Allow additional implicit operands. This allows a fixup done by the post 3736 // RA scheduler where the main implicit operand is killed and implicit-defs 3737 // are added for sub-registers that remain live after this instruction. 3738 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 3739 ErrInfo = "missing implicit register operands"; 3740 return false; 3741 } 3742 3743 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3744 if (IsDst) { 3745 if (!Dst->isUse()) { 3746 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 3747 return false; 3748 } 3749 3750 unsigned UseOpIdx; 3751 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 3752 UseOpIdx != StaticNumOps + 1) { 3753 ErrInfo = "movrel implicit operands should be tied"; 3754 return false; 3755 } 3756 } 3757 3758 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3759 const MachineOperand &ImpUse 3760 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 3761 if (!ImpUse.isReg() || !ImpUse.isUse() || 3762 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 3763 ErrInfo = "src0 should be subreg of implicit vector use"; 3764 return false; 3765 } 3766 } 3767 3768 // Make sure we aren't losing exec uses in the td files. 
This mostly requires 3769 // being careful when using let Uses to try to add other use registers. 3770 if (shouldReadExec(MI)) { 3771 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 3772 ErrInfo = "VALU instruction does not implicitly read exec mask"; 3773 return false; 3774 } 3775 } 3776 3777 if (isSMRD(MI)) { 3778 if (MI.mayStore()) { 3779 // The register offset form of scalar stores may only use m0 as the 3780 // soffset register. 3781 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 3782 if (Soff && Soff->getReg() != AMDGPU::M0) { 3783 ErrInfo = "scalar stores must use m0 as offset register"; 3784 return false; 3785 } 3786 } 3787 } 3788 3789 if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { 3790 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3791 if (Offset->getImm() != 0) { 3792 ErrInfo = "subtarget does not support offsets in flat instructions"; 3793 return false; 3794 } 3795 } 3796 3797 if (isMIMG(MI)) { 3798 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 3799 if (DimOp) { 3800 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 3801 AMDGPU::OpName::vaddr0); 3802 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 3803 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 3804 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3805 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 3806 const AMDGPU::MIMGDimInfo *Dim = 3807 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 3808 3809 if (!Dim) { 3810 ErrInfo = "dim is out of range"; 3811 return false; 3812 } 3813 3814 bool IsA16 = false; 3815 if (ST.hasR128A16()) { 3816 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 3817 IsA16 = R128A16->getImm() != 0; 3818 } else if (ST.hasGFX10A16()) { 3819 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 3820 IsA16 = A16->getImm() != 0; 3821 } 3822 3823 bool PackDerivatives = IsA16; // Either 
A16 or G16 3824 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 3825 3826 unsigned AddrWords = BaseOpcode->NumExtraArgs; 3827 unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + 3828 (BaseOpcode->LodOrClampOrMip ? 1 : 0); 3829 if (IsA16) 3830 AddrWords += (AddrComponents + 1) / 2; 3831 else 3832 AddrWords += AddrComponents; 3833 3834 if (BaseOpcode->Gradients) { 3835 if (PackDerivatives) 3836 // There are two gradients per coordinate, we pack them separately. 3837 // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) 3838 AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; 3839 else 3840 AddrWords += Dim->NumGradients; 3841 } 3842 3843 unsigned VAddrWords; 3844 if (IsNSA) { 3845 VAddrWords = SRsrcIdx - VAddr0Idx; 3846 } else { 3847 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); 3848 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; 3849 if (AddrWords > 8) 3850 AddrWords = 16; 3851 else if (AddrWords > 4) 3852 AddrWords = 8; 3853 else if (AddrWords == 4) 3854 AddrWords = 4; 3855 else if (AddrWords == 3) 3856 AddrWords = 3; 3857 } 3858 3859 if (VAddrWords != AddrWords) { 3860 ErrInfo = "bad vaddr size"; 3861 return false; 3862 } 3863 } 3864 } 3865 3866 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 3867 if (DppCt) { 3868 using namespace AMDGPU::DPP; 3869 3870 unsigned DC = DppCt->getImm(); 3871 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 3872 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 3873 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 3874 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 3875 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 3876 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 3877 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 3878 ErrInfo = "Invalid dpp_ctrl value"; 3879 return false; 3880 } 
/// Map a scalar (SALU) opcode to the VALU opcode to use when this instruction
/// is moved to the VALU, or AMDGPU::INSTRUCTION_LIST_END when there is no
/// single-instruction vector equivalent.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  // Generic pseudos stay as themselves; only their register classes change.
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
  case AMDGPU::WWM: return AMDGPU::WWM;
  case AMDGPU::S_MOV_B32: {
    // A register move (or a move into an AGPR destination) becomes a plain
    // COPY; only an immediate move becomes V_MOV_B32.
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    // V_XNOR only exists on subtargets with DL instructions; otherwise report
    // no direct equivalent.
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  // Sign extensions are expressed as bit-field extracts on the VALU.
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): the 64-bit NOT maps to the 32-bit VALU opcode here;
  // presumably the caller splits 64-bit ops into two 32-bit halves — confirm
  // against the moveToVALU expansion.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  // Scalar compares writing SCC become vector compares writing VCC.
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  // SCC-conditional branches become VCC-conditional ones.
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}
AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 3964 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 3965 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 3966 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 3967 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 3968 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 3969 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 3970 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 3971 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 3972 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 3973 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 3974 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 3975 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 3976 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 3977 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 3978 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 3979 } 3980 llvm_unreachable( 3981 "Unexpected scalar opcode without corresponding vector one!"); 3982 } 3983 3984 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 3985 unsigned OpNo) const { 3986 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3987 const MCInstrDesc &Desc = get(MI.getOpcode()); 3988 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 3989 Desc.OpInfo[OpNo].RegClass == -1) { 3990 Register Reg = MI.getOperand(OpNo).getReg(); 3991 3992 if (Register::isVirtualRegister(Reg)) 3993 return MRI.getRegClass(Reg); 3994 return RI.getPhysRegClass(Reg); 3995 } 3996 3997 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 3998 return RI.getRegClass(RCID); 3999 } 4000 4001 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 4002 MachineBasicBlock::iterator I = MI; 4003 MachineBasicBlock *MBB = MI.getParent(); 4004 MachineOperand &MO = MI.getOperand(OpIdx); 4005 
/// Legalize operand \p OpIdx of \p MI by materializing it into a freshly
/// created virtual register (via COPY, S_MOV, or V_MOV) and rewriting the
/// operand to use that register.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  // Class the instruction description demands for this operand; its size
  // selects between 32- and 64-bit move opcodes below.
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  // Choose the materializing opcode: COPY for register operands, S_MOV for
  // immediates destined for an SGPR class, otherwise V_MOV.
  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;

  // The temporary is created in the VGPR class equivalent to the demanded
  // class, narrowed to either the 64-bit or the 32-bit VGPR class.
  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  Register Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  // Emit the move just before MI, then point the operand at the new register.
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  MO.ChangeToRegister(Reg, false);
}
/// Extract subregister \p SubIdx of \p SuperReg into a new virtual register
/// of class \p SubRC, inserting the COPY(s) before \p MI. Returns the new
/// register.
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  Register SubReg = MRI.createVirtualRegister(SubRC);

  // Simple case: the operand has no subregister index of its own, so a single
  // subreg COPY suffices.
  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
        .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  Register NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
      .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

/// Like buildExtractSubReg, but \p Op may also be a 64-bit immediate, in which
/// case the selected 32-bit half of the immediate is returned directly as an
/// immediate operand (sub0 = low half, sub1 = high half).
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  // Removing operand 1 and re-appending it moves it after operand 2.
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}
MRI.getRegClass(Reg) 4099 : RI.getPhysRegClass(Reg); 4100 4101 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); 4102 if (MO.getSubReg()) { 4103 const MachineFunction *MF = MO.getParent()->getParent()->getParent(); 4104 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); 4105 if (!SuperRC) 4106 return false; 4107 4108 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); 4109 if (!DRC) 4110 return false; 4111 } 4112 return RC->hasSuperClassEq(DRC); 4113 } 4114 4115 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 4116 const MCOperandInfo &OpInfo, 4117 const MachineOperand &MO) const { 4118 if (MO.isReg()) 4119 return isLegalRegOperand(MRI, OpInfo, MO); 4120 4121 // Handle non-register types that are treated like immediates. 4122 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 4123 return true; 4124 } 4125 4126 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 4127 const MachineOperand *MO) const { 4128 const MachineFunction &MF = *MI.getParent()->getParent(); 4129 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4130 const MCInstrDesc &InstDesc = MI.getDesc(); 4131 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 4132 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4133 const TargetRegisterClass *DefinedRC = 4134 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 4135 if (!MO) 4136 MO = &MI.getOperand(OpIdx); 4137 4138 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); 4139 int VOP3LiteralLimit = ST.hasVOP3Literal() ? 
/// Check whether \p MO (defaulting to the operand currently at \p OpIdx) would
/// be a legal operand at position \p OpIdx of \p MI, including the constant
/// bus and VOP3 literal restrictions across the instruction's other operands.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  // Budget for constant-bus reads and (GFX10+) one VOP3 literal.
  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    // A VOP3 literal consumes the single literal slot (note the deliberate
    // post-decrement: the slot is spent even when the check fails).
    if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
      return false;

    // Track distinct SGPRs (reg+subreg pairs) — reads of the same SGPR only
    // count once against the constant bus.
    SmallDenseSet<RegSubRegPair> SGPRsUsed;
    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    // Charge every *other* operand of MI against the remaining budget.
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
        if (!SGPRsUsed.count(SGPR) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          if (--ConstantBusLimit <= 0)
            return false;
          SGPRsUsed.insert(SGPR);
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        // An inline 32-bit immediate operand also occupies the constant bus.
        if (--ConstantBusLimit <= 0)
          return false;
      } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
                 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
        // A VOP3 literal uses both the literal slot and the constant bus.
        if (!VOP3LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}
4183 return true; 4184 } 4185 4186 return isImmOperandLegal(MI, OpIdx, *MO); 4187 } 4188 4189 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 4190 MachineInstr &MI) const { 4191 unsigned Opc = MI.getOpcode(); 4192 const MCInstrDesc &InstrDesc = get(Opc); 4193 4194 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 4195 MachineOperand &Src0 = MI.getOperand(Src0Idx); 4196 4197 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 4198 MachineOperand &Src1 = MI.getOperand(Src1Idx); 4199 4200 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 4201 // we need to only have one constant bus use before GFX10. 4202 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 4203 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && 4204 Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || 4205 isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) 4206 legalizeOpWithMove(MI, Src0Idx); 4207 4208 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 4209 // both the value to write (src0) and lane select (src1). Fix up non-SGPR 4210 // src0/src1 with V_READFIRSTLANE. 4211 if (Opc == AMDGPU::V_WRITELANE_B32) { 4212 const DebugLoc &DL = MI.getDebugLoc(); 4213 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 4214 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4215 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4216 .add(Src0); 4217 Src0.ChangeToRegister(Reg, false); 4218 } 4219 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 4220 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4221 const DebugLoc &DL = MI.getDebugLoc(); 4222 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4223 .add(Src1); 4224 Src1.ChangeToRegister(Reg, false); 4225 } 4226 return; 4227 } 4228 4229 // No VOP2 instructions support AGPRs. 
4230 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) 4231 legalizeOpWithMove(MI, Src0Idx); 4232 4233 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) 4234 legalizeOpWithMove(MI, Src1Idx); 4235 4236 // VOP2 src0 instructions support all operand types, so we don't need to check 4237 // their legality. If src1 is already legal, we don't need to do anything. 4238 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 4239 return; 4240 4241 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 4242 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 4243 // select is uniform. 4244 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 4245 RI.isVGPR(MRI, Src1.getReg())) { 4246 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4247 const DebugLoc &DL = MI.getDebugLoc(); 4248 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4249 .add(Src1); 4250 Src1.ChangeToRegister(Reg, false); 4251 return; 4252 } 4253 4254 // We do not use commuteInstruction here because it is too aggressive and will 4255 // commute if it is possible. We only want to commute here if it improves 4256 // legality. This can be called a fairly large number of times so don't waste 4257 // compile time pointlessly swapping and checking legality again. 4258 if (HasImplicitSGPR || !MI.isCommutable()) { 4259 legalizeOpWithMove(MI, Src1Idx); 4260 return; 4261 } 4262 4263 // If src0 can be used as src1, commuting will make the operands legal. 4264 // Otherwise we have to give up and insert a move. 4265 // 4266 // TODO: Other immediate-like operand kinds could be commuted if there was a 4267 // MachineOperand::ChangeTo* for them. 
4268 if ((!Src1.isImm() && !Src1.isReg()) || 4269 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 4270 legalizeOpWithMove(MI, Src1Idx); 4271 return; 4272 } 4273 4274 int CommutedOpc = commuteOpcode(MI); 4275 if (CommutedOpc == -1) { 4276 legalizeOpWithMove(MI, Src1Idx); 4277 return; 4278 } 4279 4280 MI.setDesc(get(CommutedOpc)); 4281 4282 Register Src0Reg = Src0.getReg(); 4283 unsigned Src0SubReg = Src0.getSubReg(); 4284 bool Src0Kill = Src0.isKill(); 4285 4286 if (Src1.isImm()) 4287 Src0.ChangeToImmediate(Src1.getImm()); 4288 else if (Src1.isReg()) { 4289 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 4290 Src0.setSubReg(Src1.getSubReg()); 4291 } else 4292 llvm_unreachable("Should only have register or immediate operands"); 4293 4294 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 4295 Src1.setSubReg(Src0SubReg); 4296 fixImplicitOperands(MI); 4297 } 4298 4299 // Legalize VOP3 operands. All operand types are supported for any operand 4300 // but only one literal constant and only starting from GFX10. 
// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32 ||
      Opc == AMDGPU::V_PERMLANEX16_B32) {
    // src1 and src2 must be scalar
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src2);
      Src2.ChangeToRegister(Reg, false);
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg != AMDGPU::NoRegister) {
    // Pre-charge the budget with the SGPR we intend to keep.
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
      // Non-register, non-literal-like operands (e.g. inline constants) are
      // always fine.
      if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
        continue;

      // A literal is allowed only while both budgets have room.
      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
        !isOperandLegal(MI, Idx, &MO)) {
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue;
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}
4384 legalizeOpWithMove(MI, Idx); 4385 } 4386 } 4387 4388 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 4389 MachineRegisterInfo &MRI) const { 4390 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 4391 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 4392 Register DstReg = MRI.createVirtualRegister(SRC); 4393 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 4394 4395 if (RI.hasAGPRs(VRC)) { 4396 VRC = RI.getEquivalentVGPRClass(VRC); 4397 Register NewSrcReg = MRI.createVirtualRegister(VRC); 4398 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4399 get(TargetOpcode::COPY), NewSrcReg) 4400 .addReg(SrcReg); 4401 SrcReg = NewSrcReg; 4402 } 4403 4404 if (SubRegs == 1) { 4405 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4406 get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 4407 .addReg(SrcReg); 4408 return DstReg; 4409 } 4410 4411 SmallVector<unsigned, 8> SRegs; 4412 for (unsigned i = 0; i < SubRegs; ++i) { 4413 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4414 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4415 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 4416 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 4417 SRegs.push_back(SGPR); 4418 } 4419 4420 MachineInstrBuilder MIB = 4421 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4422 get(AMDGPU::REG_SEQUENCE), DstReg); 4423 for (unsigned i = 0; i < SubRegs; ++i) { 4424 MIB.addReg(SRegs[i]); 4425 MIB.addImm(RI.getSubRegFromChannel(i)); 4426 } 4427 return DstReg; 4428 } 4429 4430 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 4431 MachineInstr &MI) const { 4432 4433 // If the pointer is store in VGPRs, then we need to move them to 4434 // SGPRs using v_readfirstlane. This is safe because we only select 4435 // loads with uniform pointers to SMRD instruction so we know the 4436 // pointer value is uniform. 
4437 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 4438 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 4439 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 4440 SBase->setReg(SGPR); 4441 } 4442 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); 4443 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 4444 unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 4445 SOff->setReg(SGPR); 4446 } 4447 } 4448 4449 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 4450 MachineBasicBlock::iterator I, 4451 const TargetRegisterClass *DstRC, 4452 MachineOperand &Op, 4453 MachineRegisterInfo &MRI, 4454 const DebugLoc &DL) const { 4455 Register OpReg = Op.getReg(); 4456 unsigned OpSubReg = Op.getSubReg(); 4457 4458 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 4459 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 4460 4461 // Check if operand is already the correct register class. 4462 if (DstRC == OpRC) 4463 return; 4464 4465 Register DstReg = MRI.createVirtualRegister(DstRC); 4466 MachineInstr *Copy = 4467 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 4468 4469 Op.setReg(DstReg); 4470 Op.setSubReg(0); 4471 4472 MachineInstr *Def = MRI.getVRegDef(OpReg); 4473 if (!Def) 4474 return; 4475 4476 // Try to eliminate the copy if it is copying an immediate value. 
/// Ensure the register operand \p Op is in register class \p DstRC by copying
/// it into a fresh register of that class before \p I, rewriting \p Op to use
/// the copy. Attempts to fold an immediate-defining source into the copy, and
/// may mark the copy as implicitly reading EXEC (see below).
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  Register OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  Register DstReg = MRI.createVirtualRegister(DstRC);
  MachineInstr *Copy =
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

  // Point the operand at the new register; the copy absorbs any subreg index.
  Op.setReg(DstReg);
  Op.setSubReg(0);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
    FoldImmediate(*Copy, *Def, OpReg, &MRI);

  // Walk through chains of virtual-register COPYs to find whether the value
  // ultimately comes from an IMPLICIT_DEF (stop at physical-register sources).
  bool ImpDef = Def->isImplicitDef();
  while (!ImpDef && Def && Def->isCopy()) {
    if (Def->getOperand(1).getReg().isPhysical())
      break;
    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
    ImpDef = Def && Def->isImplicitDef();
  }
  // For non-SGPR destinations whose source is not an implicit def, add an
  // implicit EXEC use to the copy unless it already reads EXEC — presumably
  // because such a copy will be lowered to an exec-dependent VALU move;
  // confirm against the copy-lowering code.
  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
      !ImpDef)
    Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
}
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p Rsrc across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                          MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                          const DebugLoc &DL, MachineOperand &Rsrc) {
  MachineFunction &MF = *OrigBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Select the wave32 or wave64 flavor of the exec-mask opcodes/registers.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned SaveExecOpc =
      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  unsigned XorTermOpc =
      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  unsigned AndOpc =
      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  // New code is inserted at the top of the loop body, before the wrapped
  // instruction that already lives in LoopBB.
  MachineBasicBlock::iterator I = LoopBB.begin();

  Register VRsrc = Rsrc.getReg();
  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  // Beginning of the loop, read the next Rsrc variant.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);

  // Reassemble the four 32-bit pieces into the 128-bit scalar descriptor.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
      .addReg(SRsrcSub0)
      .addImm(AMDGPU::sub0)
      .addReg(SRsrcSub1)
      .addImm(AMDGPU::sub1)
      .addReg(SRsrcSub2)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcSub3)
      .addImm(AMDGPU::sub3);

  // Update Rsrc operand to use the SGPR Rsrc.
  Rsrc.setReg(SRsrc);
  Rsrc.setIsKill(true);

  // Identify all lanes with identical Rsrc operands in their VGPRs.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
      .addReg(CondReg0)
      .addReg(CondReg1);

  MRI.setSimpleHint(SaveExec, AndCond);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
      .addReg(AndCond, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = LoopBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
      .addReg(Exec)
      .addReg(SaveExec);
  // Loop back while any lane still has a different Rsrc value pending.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4590 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 4591 4592 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 4593 4594 // Save the EXEC mask 4595 BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 4596 4597 // Killed uses in the instruction we are waterfalling around will be 4598 // incorrect due to the added control-flow. 4599 for (auto &MO : MI.uses()) { 4600 if (MO.isReg() && MO.isUse()) { 4601 MRI.clearKillFlags(MO.getReg()); 4602 } 4603 } 4604 4605 // To insert the loop we need to split the block. Move everything after this 4606 // point to a new block, and insert a new empty block between the two. 4607 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 4608 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 4609 MachineFunction::iterator MBBI(MBB); 4610 ++MBBI; 4611 4612 MF.insert(MBBI, LoopBB); 4613 MF.insert(MBBI, RemainderBB); 4614 4615 LoopBB->addSuccessor(LoopBB); 4616 LoopBB->addSuccessor(RemainderBB); 4617 4618 // Move MI to the LoopBB, and the remainder of the block to RemainderBB. 4619 MachineBasicBlock::iterator J = I++; 4620 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 4621 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 4622 LoopBB->splice(LoopBB->begin(), &MBB, J); 4623 4624 MBB.addSuccessor(LoopBB); 4625 4626 // Update dominators. We know that MBB immediately dominates LoopBB, that 4627 // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately 4628 // dominates all of the successors transferred to it from MBB that MBB used 4629 // to properly dominate. 
4630 if (MDT) { 4631 MDT->addNewBlock(LoopBB, &MBB); 4632 MDT->addNewBlock(RemainderBB, LoopBB); 4633 for (auto &Succ : RemainderBB->successors()) { 4634 if (MDT->properlyDominates(&MBB, Succ)) { 4635 MDT->changeImmediateDominator(Succ, RemainderBB); 4636 } 4637 } 4638 } 4639 4640 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); 4641 4642 // Restore the EXEC mask 4643 MachineBasicBlock::iterator First = RemainderBB->begin(); 4644 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 4645 } 4646 4647 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 4648 static std::tuple<unsigned, unsigned> 4649 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 4650 MachineBasicBlock &MBB = *MI.getParent(); 4651 MachineFunction &MF = *MBB.getParent(); 4652 MachineRegisterInfo &MRI = MF.getRegInfo(); 4653 4654 // Extract the ptr from the resource descriptor. 4655 unsigned RsrcPtr = 4656 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 4657 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 4658 4659 // Create an empty resource descriptor 4660 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4661 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4662 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4663 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 4664 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 4665 4666 // Zero64 = 0 4667 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 4668 .addImm(0); 4669 4670 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 4671 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 4672 .addImm(RsrcDataFormat & 0xFFFFFFFF); 4673 4674 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 4675 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 4676 .addImm(RsrcDataFormat >> 32); 4677 4678 // 
  // NewSRsrc = {Zero64, SRsrcFormat}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

  return std::make_tuple(RsrcPtr, NewSRsrc);
}

// Rewrite the operands of \p MI so they satisfy its operand constraints
// (e.g. SGPR-only source slots fed by VGPRs). Dispatches per instruction
// family; \p MDT is forwarded to legalizations that restructure the CFG.
void SIInstrInfo::legalizeOperands(MachineInstr &MI,
                                   MachineDominatorTree *MDT) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // Scan the incoming values (operands 1, 3, 5, ...) and remember one
    // scalar and one vector register class seen among them.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !Register::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVectorRegisters(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
          VRC = &AMDGPU::VReg_1RegClass;
        } else
          VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                    ? RI.getEquivalentAGPRClass(SRC)
                    : RI.getEquivalentVGPRClass(SRC);
      } else {
        VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                  ? RI.getEquivalentAGPRClass(VRC)
                  : RI.getEquivalentVGPRClass(VRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
        continue;

      // MI is a PHI instruction, so any copy must be inserted at the end of
      // the corresponding predecessor block, not next to the PHI itself.
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      // Avoid creating no-op copies with the same src and dst reg class. These
      // confuse some of the machine passes.
      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI.getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    Register Dst = MI.getOperand(0).getReg();
    Register Src0 = MI.getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock *MBB = MI.getParent();
      MachineOperand &Op = MI.getOperand(1);
      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    }
    return;
  }

  // Legalize SI_INIT_M0: the source must be readable on the scalar unit.
  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
    MachineOperand &Src = MI.getOperand(0);
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return;
  }

  // Legalize MIMG and MUBUF/MTBUF for shaders.
  //
  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
  // scratch memory access. In both cases, the legalization never involves
  // conversion to the addr64 form.
  if (isMIMG(MI) ||
      (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
       (isMUBUF(MI) || isMTBUF(MI)))) {
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions.
  int RsrcIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  if (RsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
    unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
                             RI.getRegClass(RsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    // Legalize a VGPR Rsrc.
    //
    // If the instruction is _ADDR64, we can avoid a waterfall by extracting
    // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
    // a zero-value SRsrc.
    //
    // If the instruction is _OFFSET (both idxen and offen disabled), and we
    // support ADDR64 instructions, we can convert to ADDR64 and do the same as
    // above.
    //
    // Otherwise we are on non-ADDR64 hardware, and/or we have
    // idxen/offen/bothen and we fall back to a waterfall loop.

    MachineBasicBlock &MBB = *MI.getParent();

    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

      const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
      Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
      Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
          .addDef(CondReg0)
          .addReg(RsrcPtr, 0, AMDGPU::sub0)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
          .addImm(0);

      // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1, consuming the carry out of
      // the low add; the carry out of the high add is dead.
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
          .addDef(CondReg1, RegState::Dead)
          .addReg(RsrcPtr, 0, AMDGPU::sub1)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
          .addReg(CondReg0, RegState::Kill)
          .addImm(0);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
          .addReg(NewVAddrLo)
          .addImm(AMDGPU::sub0)
          .addReg(NewVAddrHi)
          .addImm(AMDGPU::sub1);

      VAddr->setReg(NewVAddr);
      Rsrc->setReg(NewSRsrc);
    } else if (!VAddr && ST.hasAddr64()) {
      // This instructions is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                .add(*VData)
                .addReg(NewVAddr)
                .addReg(NewSRsrc)
                .add(*SOffset)
                .add(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
                getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }
        if (const MachineOperand *DLC =
                getNamedOperand(MI, AMDGPU::OpName::dlc)) {
          MIB.addImm(DLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));

        MIB.cloneMemRefs(MI);
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                     .add(*VData)
                     .add(*VDataIn)
                     .addReg(NewVAddr)
                     .addReg(NewSRsrc)
                     .add(*SOffset)
                     .add(*Offset)
                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
                     .cloneMemRefs(MI);
      }

      MI.removeFromParent();

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
              NewVAddr)
          .addReg(RsrcPtr, 0, AMDGPU::sub0)
          .addImm(AMDGPU::sub0)
          .addReg(RsrcPtr, 0, AMDGPU::sub1)
          .addImm(AMDGPU::sub1);
    } else {
      // This is another variant; legalize Rsrc with waterfall loop from VGPRs
      // to SGPRs.
      loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
    }
  }
}

// Move a scalar (SALU) instruction and, transitively, everything that uses
// its result, over to the vector (VALU) unit. Processes a worklist seeded
// with \p TopInst; \p MDT is kept up to date by legalizations that split
// blocks (e.g. waterfall loops).
void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
                             MachineDominatorTree *MDT) const {
  SetVectorType Worklist;
  Worklist.insert(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned NewOpcode = getVALUOp(Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO:
      splitScalar64BitAddSub(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32:
      // FIXME: The u32 versions currently selected use the carry.
      if (moveScalarAddSub(Worklist, Inst, MDT))
        continue;

      // Default handling
      break;

    // 64-bit scalar bitwise ops have no VALU equivalent; split them into a
    // pair of 32-bit ops on the sub0/sub1 halves.
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B64:
      // With DL insts a direct per-half XNOR exists; otherwise rewrite as
      // NOT + XOR.
      if (ST.hasDLInsts())
        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
      else
        splitScalar64BitXnor(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64:
      splitScalar64BitBFE(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    // Some targets only have the "reversed" shift forms (shift amount in
    // src0), so switch the opcode and swap the operands.
    case AMDGPU::S_LSHL_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      if (ST.isWave32())
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
                AMDGPU::VCC_LO)
            .addReg(AMDGPU::EXEC_LO)
            .addReg(AMDGPU::VCC_LO);
      else
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
                AMDGPU::VCC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");

    case AMDGPU::S_PACK_LL_B32_B16:
    case AMDGPU::S_PACK_LH_B32_B16:
    case AMDGPU::S_PACK_HH_B32_B16:
      movePackToVALU(Worklist, MRI, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B32:
      lowerScalarXnor(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst, MDT);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst.setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // We're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst.getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        // Only propagate through live-def of SCC.
        if (Op.isDef() && !Op.isDead())
          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
        Inst.RemoveOperand(i);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst.addOperand(MachineOperand::CreateImm(0));
      Inst.addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst.addOperand(MachineOperand::CreateImm(0));
    }

    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    fixImplicitOperands(Inst);

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst.RemoveOperand(2);                      // Remove old immediate.
      Inst.addOperand(MachineOperand::CreateImm(Offset));
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      Register DstReg = Inst.getOperand(0).getReg();
      if (Register::isPhysicalRegister(DstReg))
        continue;

      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
      if (!NewDstRC)
        continue;

      if (Inst.isCopy() &&
          Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge. Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
                                                 AMDGPU::V_ADD_U32_e64
                                               : AMDGPU::V_SUB_U32_e64;

    // Drop the implicit SCC def; the no-carry VALU form does not produce it.
    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    Inst.RemoveOperand(3);

    Inst.setDesc(get(NewOpc));
    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
    Inst.addImplicitDefUseOperands(*MBB.getParent());
    MRI.replaceRegWith(OldDstReg, ResultReg);
    legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return true;
  }

  return false;
}

// Lower S_ABS_I32 to the VALU sequence max(x, 0 - x).
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                 MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  unsigned SubOp = ST.hasAddNoCarry() ?
    AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
      .addImm(0)
      .addReg(Src.getReg());

  // ResultReg = max(Src, -Src) == |Src|
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
      .addReg(Src.getReg())
      .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Lower S_XNOR_B32 for the VALU: directly with V_XNOR where available,
// otherwise as a NOT combined with an XOR.
void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
        .add(Src0)
        .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .addReg(Temp)
                .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .add(Src0)
                .addReg(Temp);
    } else {
      // Neither source is scalar: XOR first, then invert the result.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
                .add(Src0)
                .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}

// Expand a NAND/NOR-style op into <Opcode> followed by S_NOT_B32; both new
// instructions are queued for their own VALU lowering.
void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
                                      MachineInstr &Inst,
                                      unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
                          .add(Src0)
                          .add(Src1);

  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
                           .addReg(Interm);

  Worklist.insert(&Op);
  Worklist.insert(&Not);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

// Expand an ANDN2/ORN2-style op into S_NOT_B32 of src1 followed by
// <Opcode>; both new instructions are queued for their own VALU lowering.
void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
                                     MachineInstr &Inst,
                                     unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
                           .add(Src1);

  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
                          .add(Src0)
                          .addReg(Interm);

  Worklist.insert(&Not);
  Worklist.insert(&Op);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

// Split a 64-bit scalar unary op into two 32-bit ops (<Opcode>) on the
// sub0/sub1 halves, recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as SGPR_32.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

// Split a 64-bit scalar add/sub pseudo into a 32-bit add/sub producing a
// carry plus a 32-bit add/sub-with-carry on the high half.
void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
                                         MachineInstr &Inst,
                                         MachineDominatorTree *MDT) const {
  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register CarryReg = MRI.createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  // Low half defines the carry consumed by the high half.
  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
  MachineInstr *LoHalf =
    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    .addReg(CarryReg, RegState::Define)
    .add(SrcReg0Sub0)
    .add(SrcReg1Sub0)
    .addImm(0); // clamp bit

  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  MachineInstr *HiHalf =
    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .add(SrcReg0Sub1)
    .add(SrcReg1Sub1)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as SGPR_32.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
5609 MRI.getRegClass(Src1.getReg()) : 5610 &AMDGPU::SGPR_32RegClass; 5611 5612 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 5613 5614 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5615 AMDGPU::sub0, Src0SubRC); 5616 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 5617 AMDGPU::sub0, Src1SubRC); 5618 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5619 AMDGPU::sub1, Src0SubRC); 5620 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 5621 AMDGPU::sub1, Src1SubRC); 5622 5623 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 5624 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 5625 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 5626 5627 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 5628 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 5629 .add(SrcReg0Sub0) 5630 .add(SrcReg1Sub0); 5631 5632 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 5633 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 5634 .add(SrcReg0Sub1) 5635 .add(SrcReg1Sub1); 5636 5637 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 5638 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 5639 .addReg(DestSub0) 5640 .addImm(AMDGPU::sub0) 5641 .addReg(DestSub1) 5642 .addImm(AMDGPU::sub1); 5643 5644 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 5645 5646 Worklist.insert(&LoHalf); 5647 Worklist.insert(&HiHalf); 5648 5649 // Move all users of this moved vlaue. 
5650 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 5651 } 5652 5653 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, 5654 MachineInstr &Inst, 5655 MachineDominatorTree *MDT) const { 5656 MachineBasicBlock &MBB = *Inst.getParent(); 5657 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5658 5659 MachineOperand &Dest = Inst.getOperand(0); 5660 MachineOperand &Src0 = Inst.getOperand(1); 5661 MachineOperand &Src1 = Inst.getOperand(2); 5662 const DebugLoc &DL = Inst.getDebugLoc(); 5663 5664 MachineBasicBlock::iterator MII = Inst; 5665 5666 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 5667 5668 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5669 5670 MachineOperand* Op0; 5671 MachineOperand* Op1; 5672 5673 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { 5674 Op0 = &Src0; 5675 Op1 = &Src1; 5676 } else { 5677 Op0 = &Src1; 5678 Op1 = &Src0; 5679 } 5680 5681 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) 5682 .add(*Op0); 5683 5684 Register NewDest = MRI.createVirtualRegister(DestRC); 5685 5686 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) 5687 .addReg(Interm) 5688 .add(*Op1); 5689 5690 MRI.replaceRegWith(Dest.getReg(), NewDest); 5691 5692 Worklist.insert(&Xor); 5693 } 5694 5695 void SIInstrInfo::splitScalar64BitBCNT( 5696 SetVectorType &Worklist, MachineInstr &Inst) const { 5697 MachineBasicBlock &MBB = *Inst.getParent(); 5698 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5699 5700 MachineBasicBlock::iterator MII = Inst; 5701 const DebugLoc &DL = Inst.getDebugLoc(); 5702 5703 MachineOperand &Dest = Inst.getOperand(0); 5704 MachineOperand &Src = Inst.getOperand(1); 5705 5706 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 5707 const TargetRegisterClass *SrcRC = Src.isReg() ? 
5708 MRI.getRegClass(Src.getReg()) : 5709 &AMDGPU::SGPR_32RegClass; 5710 5711 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5712 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5713 5714 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 5715 5716 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 5717 AMDGPU::sub0, SrcSubRC); 5718 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 5719 AMDGPU::sub1, SrcSubRC); 5720 5721 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 5722 5723 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 5724 5725 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5726 5727 // We don't need to legalize operands here. src0 for etiher instruction can be 5728 // an SGPR, and the second input is unused or determined here. 5729 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5730 } 5731 5732 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, 5733 MachineInstr &Inst) const { 5734 MachineBasicBlock &MBB = *Inst.getParent(); 5735 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5736 MachineBasicBlock::iterator MII = Inst; 5737 const DebugLoc &DL = Inst.getDebugLoc(); 5738 5739 MachineOperand &Dest = Inst.getOperand(0); 5740 uint32_t Imm = Inst.getOperand(2).getImm(); 5741 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 5742 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 5743 5744 (void) Offset; 5745 5746 // Only sext_inreg cases handled. 
5747 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 5748 Offset == 0 && "Not implemented"); 5749 5750 if (BitWidth < 32) { 5751 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5752 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5753 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5754 5755 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 5756 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 5757 .addImm(0) 5758 .addImm(BitWidth); 5759 5760 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 5761 .addImm(31) 5762 .addReg(MidRegLo); 5763 5764 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 5765 .addReg(MidRegLo) 5766 .addImm(AMDGPU::sub0) 5767 .addReg(MidRegHi) 5768 .addImm(AMDGPU::sub1); 5769 5770 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5771 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5772 return; 5773 } 5774 5775 MachineOperand &Src = Inst.getOperand(1); 5776 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5777 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5778 5779 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 5780 .addImm(31) 5781 .addReg(Src.getReg(), 0, AMDGPU::sub0); 5782 5783 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 5784 .addReg(Src.getReg(), 0, AMDGPU::sub0) 5785 .addImm(AMDGPU::sub0) 5786 .addReg(TmpReg) 5787 .addImm(AMDGPU::sub1); 5788 5789 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5790 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5791 } 5792 5793 void SIInstrInfo::addUsersToMoveToVALUWorklist( 5794 Register DstReg, 5795 MachineRegisterInfo &MRI, 5796 SetVectorType &Worklist) const { 5797 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 5798 E = MRI.use_end(); I != E;) { 5799 MachineInstr &UseMI = *I->getParent(); 5800 5801 unsigned OpNo = 0; 5802 5803 switch (UseMI.getOpcode()) { 
5804 case AMDGPU::COPY: 5805 case AMDGPU::WQM: 5806 case AMDGPU::SOFT_WQM: 5807 case AMDGPU::WWM: 5808 case AMDGPU::REG_SEQUENCE: 5809 case AMDGPU::PHI: 5810 case AMDGPU::INSERT_SUBREG: 5811 break; 5812 default: 5813 OpNo = I.getOperandNo(); 5814 break; 5815 } 5816 5817 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { 5818 Worklist.insert(&UseMI); 5819 5820 do { 5821 ++I; 5822 } while (I != E && I->getParent() == &UseMI); 5823 } else { 5824 ++I; 5825 } 5826 } 5827 } 5828 5829 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, 5830 MachineRegisterInfo &MRI, 5831 MachineInstr &Inst) const { 5832 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5833 MachineBasicBlock *MBB = Inst.getParent(); 5834 MachineOperand &Src0 = Inst.getOperand(1); 5835 MachineOperand &Src1 = Inst.getOperand(2); 5836 const DebugLoc &DL = Inst.getDebugLoc(); 5837 5838 switch (Inst.getOpcode()) { 5839 case AMDGPU::S_PACK_LL_B32_B16: { 5840 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5841 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5842 5843 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 5844 // 0. 
5845 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 5846 .addImm(0xffff); 5847 5848 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 5849 .addReg(ImmReg, RegState::Kill) 5850 .add(Src0); 5851 5852 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) 5853 .add(Src1) 5854 .addImm(16) 5855 .addReg(TmpReg, RegState::Kill); 5856 break; 5857 } 5858 case AMDGPU::S_PACK_LH_B32_B16: { 5859 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5860 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 5861 .addImm(0xffff); 5862 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) 5863 .addReg(ImmReg, RegState::Kill) 5864 .add(Src0) 5865 .add(Src1); 5866 break; 5867 } 5868 case AMDGPU::S_PACK_HH_B32_B16: { 5869 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5870 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5871 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 5872 .addImm(16) 5873 .add(Src0); 5874 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 5875 .addImm(0xffff0000); 5876 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) 5877 .add(Src1) 5878 .addReg(ImmReg, RegState::Kill) 5879 .addReg(TmpReg, RegState::Kill); 5880 break; 5881 } 5882 default: 5883 llvm_unreachable("unhandled s_pack_* instruction"); 5884 } 5885 5886 MachineOperand &Dest = Inst.getOperand(0); 5887 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5888 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5889 } 5890 5891 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, 5892 MachineInstr &SCCDefInst, 5893 SetVectorType &Worklist) const { 5894 // Ensure that def inst defines SCC, which is still live. 5895 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && 5896 !Op.isDead() && Op.getParent() == &SCCDefInst); 5897 // This assumes that all the users of SCC are in the same block 5898 // as the SCC def. 
5899 for (MachineInstr &MI : // Skip the def inst itself. 5900 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), 5901 SCCDefInst.getParent()->end())) { 5902 // Check if SCC is used first. 5903 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) 5904 Worklist.insert(&MI); 5905 // Exit if we find another SCC def. 5906 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 5907 return; 5908 } 5909 } 5910 5911 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 5912 const MachineInstr &Inst) const { 5913 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 5914 5915 switch (Inst.getOpcode()) { 5916 // For target instructions, getOpRegClass just returns the virtual register 5917 // class associated with the operand, so we need to find an equivalent VGPR 5918 // register class in order to move the instruction to the VALU. 5919 case AMDGPU::COPY: 5920 case AMDGPU::PHI: 5921 case AMDGPU::REG_SEQUENCE: 5922 case AMDGPU::INSERT_SUBREG: 5923 case AMDGPU::WQM: 5924 case AMDGPU::SOFT_WQM: 5925 case AMDGPU::WWM: { 5926 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 5927 if (RI.hasAGPRs(SrcRC)) { 5928 if (RI.hasAGPRs(NewDstRC)) 5929 return nullptr; 5930 5931 switch (Inst.getOpcode()) { 5932 case AMDGPU::PHI: 5933 case AMDGPU::REG_SEQUENCE: 5934 case AMDGPU::INSERT_SUBREG: 5935 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 5936 break; 5937 default: 5938 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 5939 } 5940 5941 if (!NewDstRC) 5942 return nullptr; 5943 } else { 5944 if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 5945 return nullptr; 5946 5947 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 5948 if (!NewDstRC) 5949 return nullptr; 5950 } 5951 5952 return NewDstRC; 5953 } 5954 default: 5955 return NewDstRC; 5956 } 5957 } 5958 5959 // Find the one SGPR operand we are allowed to use. 
/// Find the one SGPR that \p MI may keep among the operands listed in
/// \p OpIndices (-1 terminated) under the single-constant-bus restriction.
Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  Register SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  Register UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, Check the dynamic register class.
    Register Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// Return the named operand of \p MI, or nullptr if the opcode has no such
/// operand.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// Return the default buffer resource descriptor data-format word for this
/// subtarget generation.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60); // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// Return words 2-3 of the scratch buffer resource descriptor (size, stride,
/// swizzle-enable and generation-specific format bits).
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32.
  uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

/// Scalar memory reads are considered the low-latency instructions here.
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

/// Buffer/image/flat loads are considered high-latency definitions.
bool SIInstrInfo::isHighLatencyDef(int Opc) const {
  return get(Opc).mayLoad() &&
         (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}

/// If \p MI accesses a stack slot through its vaddr operand, set
/// \p FrameIndex and return the vdata register; otherwise return NoRegister.
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

/// SGPR-spill variant of isStackAccess: the frame index lives in the addr
/// operand and the spilled value in the data operand.
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

/// TargetInstrInfo hook: return the register loaded from a stack slot, or
/// NoRegister if \p MI is not such a load.
unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// TargetInstrInfo hook: return the register stored to a stack slot, or
/// NoRegister if \p MI is not such a store.
unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// Sum the sizes of all instructions inside the bundle headed by \p MI.
unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }

  return Size;
}

/// Return the encoded size of \p MI in bytes, accounting for trailing
/// 32-bit literals, MIMG NSA words, bundles and inline asm.
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  if (isFixedSize(MI))
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return DescSize; // No operands.

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (Src2Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    return DescSize;
  }

  // Check whether we have extra NSA words.
  if (isMIMG(MI)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx < 0)
      return 8;

    int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
  }

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::EH_LABEL:
    return 0;
  case TargetOpcode::BUNDLE:
    return getInstBundleSize(MI);
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(),
                              &MF->getSubtarget());
  }
  default:
    return DescSize;
  }
}

/// Return true if the FLAT instruction \p MI may access the flat address
/// space (conservatively true when no memory operands are attached).
bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}

bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
}

/// Rewrite a non-uniform branch at the end of \p IfEntry into the structured
/// SI_IF / SI_END_CF pair (SI_END_CF placed at the start of \p IfEnd).
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstr *SIIF =
      BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
      .add(Branch->getOperand(0))
      .add(Branch->getOperand(1));
    MachineInstr *SIEND =
      BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
      .addReg(DstReg);

    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}

/// Rewrite a non-uniform loop backedge branch into the structured
/// SI_IF_BREAK / SI_LOOP form, adding the exec-mask PHI to the loop header.
void SIInstrInfo::convertNonUniformLoopRegion(
  MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstrBuilder HeaderPHIBuilder =
      BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
           E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        // The backedge carries the accumulated break mask.
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        // All other predecessors enter the loop with a zero mask.
        MachineBasicBlock *PMBB = *PI;
        Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                              .addReg(DstReg)
                              .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
      BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
      .addReg(BackEdgeReg)
      .addMBB(LoopEntry);

    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}

ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
    {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
    {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
}

ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" },
    { MO_ABS32_LO, "amdgpu-abs32-lo" },
    { MO_ABS32_HI, "amdgpu-abs32-hi" },
  };

  return makeArrayRef(TargetFlags);
}

/// Non-terminator, non-COPY instructions that modify EXEC are treated as
/// part of the basic block prologue.
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

/// Build a 32-bit add that produces no (used) carry. On targets without
/// carry-less add, a dead scratch carry register is allocated.
MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           Register DestReg) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
  // Hint the allocator towards VCC so the carry def stays cheap.
  MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
         .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

/// Post-RA variant of getAddNoCarry: scavenges a carry register when one is
/// needed. May return a null builder if no register could be scavenged.
MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               const DebugLoc &DL,
                                               Register DestReg,
                                               RegScavenger &RS) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);

  // If available, prefer to use vcc.
  Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
                         ? Register(RI.getVCC())
                         : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);

  // TODO: Users need to deal with this.
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
         .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

/// Map a SI_KILL_*_PSEUDO opcode to its terminator form.
const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}

/// On wave32 subtargets, rewrite implicit VCC operands to VCC_LO.
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  if (!ST.isWave32())
    return;

  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}

/// Return true if \p MI is an SMRD load through a buffer resource (a 128-bit
/// SGPR descriptor) rather than a plain 64-bit address.
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}

/// Return the number of immediate-offset bits available to FLAT instructions
/// for \p AddrSpace on this subtarget (0 if unsupported).
unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
                                           bool Signed) const {
  if (!ST.hasFlatInstOffsets())
    return 0;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return 0;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
    return Signed ? 12 : 11;

  return Signed ? 13 : 12;
}

/// Return true if \p Offset fits in a FLAT instruction's immediate offset
/// field for \p AddrSpace. Must agree with getNumFlatOffsetBits above.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    bool Signed) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return false;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (Signed && isInt<12>(Offset)) ||
           (!Signed && isUInt<11>(Offset));
  }

  return (Signed && isInt<13>(Offset)) ||
         (!Signed && isUInt<12>(Offset));
}


// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5,
  GFX10 = 6,
  SDWA10 = 7
};

/// Map the subtarget generation to the encoding family used when resolving
/// pseudo instructions to MC opcodes.
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch(MCOp) {
  // These opcodes use indirect register
addressing so 6513 // they need special handling by codegen (currently missing). 6514 // Therefore it is too risky to allow these opcodes 6515 // to be selected by dpp combiner or sdwa peepholer. 6516 case AMDGPU::V_MOVRELS_B32_dpp_gfx10: 6517 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: 6518 case AMDGPU::V_MOVRELD_B32_dpp_gfx10: 6519 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: 6520 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: 6521 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: 6522 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: 6523 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: 6524 return true; 6525 default: 6526 return false; 6527 } 6528 } 6529 6530 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { 6531 SIEncodingFamily Gen = subtargetEncodingFamily(ST); 6532 6533 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && 6534 ST.getGeneration() == AMDGPUSubtarget::GFX9) 6535 Gen = SIEncodingFamily::GFX9; 6536 6537 // Adjust the encoding family to GFX80 for D16 buffer instructions when the 6538 // subtarget has UnpackedD16VMem feature. 6539 // TODO: remove this when we discard GFX80 encoding. 6540 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) 6541 Gen = SIEncodingFamily::GFX80; 6542 6543 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { 6544 switch (ST.getGeneration()) { 6545 default: 6546 Gen = SIEncodingFamily::SDWA; 6547 break; 6548 case AMDGPUSubtarget::GFX9: 6549 Gen = SIEncodingFamily::SDWA9; 6550 break; 6551 case AMDGPUSubtarget::GFX10: 6552 Gen = SIEncodingFamily::SDWA10; 6553 break; 6554 } 6555 } 6556 6557 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); 6558 6559 // -1 means that Opcode is already a native instruction. 6560 if (MCOp == -1) 6561 return Opcode; 6562 6563 // (uint16_t)-1 means that Opcode is a pseudo instruction that has 6564 // no encoding in the given subtarget generation. 
6565 if (MCOp == (uint16_t)-1) 6566 return -1; 6567 6568 if (isAsmOnlyOpcode(MCOp)) 6569 return -1; 6570 6571 return MCOp; 6572 } 6573 6574 static 6575 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { 6576 assert(RegOpnd.isReg()); 6577 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : 6578 getRegSubRegPair(RegOpnd); 6579 } 6580 6581 TargetInstrInfo::RegSubRegPair 6582 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { 6583 assert(MI.isRegSequence()); 6584 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) 6585 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { 6586 auto &RegOp = MI.getOperand(1 + 2 * I); 6587 return getRegOrUndef(RegOp); 6588 } 6589 return TargetInstrInfo::RegSubRegPair(); 6590 } 6591 6592 // Try to find the definition of reg:subreg in subreg-manipulation pseudos 6593 // Following a subreg of reg:subreg isn't supported 6594 static bool followSubRegDef(MachineInstr &MI, 6595 TargetInstrInfo::RegSubRegPair &RSR) { 6596 if (!RSR.SubReg) 6597 return false; 6598 switch (MI.getOpcode()) { 6599 default: break; 6600 case AMDGPU::REG_SEQUENCE: 6601 RSR = getRegSequenceSubReg(MI, RSR.SubReg); 6602 return true; 6603 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg 6604 case AMDGPU::INSERT_SUBREG: 6605 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) 6606 // inserted the subreg we're looking for 6607 RSR = getRegOrUndef(MI.getOperand(2)); 6608 else { // the subreg in the rest of the reg 6609 auto R1 = getRegOrUndef(MI.getOperand(1)); 6610 if (R1.SubReg) // subreg of subreg isn't supported 6611 return false; 6612 RSR.Reg = R1.Reg; 6613 } 6614 return true; 6615 } 6616 return false; 6617 } 6618 6619 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, 6620 MachineRegisterInfo &MRI) { 6621 assert(MRI.isSSA()); 6622 if (!Register::isVirtualRegister(P.Reg)) 6623 return nullptr; 6624 6625 auto RSR = P; 6626 auto *DefInst = 
MRI.getVRegDef(RSR.Reg); 6627 while (auto *MI = DefInst) { 6628 DefInst = nullptr; 6629 switch (MI->getOpcode()) { 6630 case AMDGPU::COPY: 6631 case AMDGPU::V_MOV_B32_e32: { 6632 auto &Op1 = MI->getOperand(1); 6633 if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) { 6634 if (Op1.isUndef()) 6635 return nullptr; 6636 RSR = getRegSubRegPair(Op1); 6637 DefInst = MRI.getVRegDef(RSR.Reg); 6638 } 6639 break; 6640 } 6641 default: 6642 if (followSubRegDef(*MI, RSR)) { 6643 if (!RSR.Reg) 6644 return nullptr; 6645 DefInst = MRI.getVRegDef(RSR.Reg); 6646 } 6647 } 6648 if (!DefInst) 6649 return MI; 6650 } 6651 return nullptr; 6652 } 6653 6654 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, 6655 Register VReg, 6656 const MachineInstr &DefMI, 6657 const MachineInstr &UseMI) { 6658 assert(MRI.isSSA() && "Must be run on SSA"); 6659 6660 auto *TRI = MRI.getTargetRegisterInfo(); 6661 auto *DefBB = DefMI.getParent(); 6662 6663 // Don't bother searching between blocks, although it is possible this block 6664 // doesn't modify exec. 6665 if (UseMI.getParent() != DefBB) 6666 return true; 6667 6668 const int MaxInstScan = 20; 6669 int NumInst = 0; 6670 6671 // Stop scan at the use. 
6672 auto E = UseMI.getIterator(); 6673 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { 6674 if (I->isDebugInstr()) 6675 continue; 6676 6677 if (++NumInst > MaxInstScan) 6678 return true; 6679 6680 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 6681 return true; 6682 } 6683 6684 return false; 6685 } 6686 6687 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, 6688 Register VReg, 6689 const MachineInstr &DefMI) { 6690 assert(MRI.isSSA() && "Must be run on SSA"); 6691 6692 auto *TRI = MRI.getTargetRegisterInfo(); 6693 auto *DefBB = DefMI.getParent(); 6694 6695 const int MaxUseInstScan = 10; 6696 int NumUseInst = 0; 6697 6698 for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { 6699 // Don't bother searching between blocks, although it is possible this block 6700 // doesn't modify exec. 6701 if (UseInst.getParent() != DefBB) 6702 return true; 6703 6704 if (++NumUseInst > MaxUseInstScan) 6705 return true; 6706 } 6707 6708 const int MaxInstScan = 20; 6709 int NumInst = 0; 6710 6711 // Stop scan when we have seen all the uses. 
6712 for (auto I = std::next(DefMI.getIterator()); ; ++I) { 6713 if (I->isDebugInstr()) 6714 continue; 6715 6716 if (++NumInst > MaxInstScan) 6717 return true; 6718 6719 if (I->readsRegister(VReg)) 6720 if (--NumUseInst == 0) 6721 return false; 6722 6723 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 6724 return true; 6725 } 6726 } 6727 6728 MachineInstr *SIInstrInfo::createPHIDestinationCopy( 6729 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, 6730 const DebugLoc &DL, Register Src, Register Dst) const { 6731 auto Cur = MBB.begin(); 6732 if (Cur != MBB.end()) 6733 do { 6734 if (!Cur->isPHI() && Cur->readsRegister(Dst)) 6735 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); 6736 ++Cur; 6737 } while (Cur != MBB.end() && Cur != LastPHIIt); 6738 6739 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, 6740 Dst); 6741 } 6742 6743 MachineInstr *SIInstrInfo::createPHISourceCopy( 6744 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, 6745 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { 6746 if (InsPt != MBB.end() && 6747 (InsPt->getOpcode() == AMDGPU::SI_IF || 6748 InsPt->getOpcode() == AMDGPU::SI_ELSE || 6749 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && 6750 InsPt->definesRegister(Src)) { 6751 InsPt++; 6752 return BuildMI(MBB, InsPt, DL, 6753 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term 6754 : AMDGPU::S_MOV_B64_term), 6755 Dst) 6756 .addReg(Src, 0, SrcSubReg) 6757 .addReg(AMDGPU::EXEC, RegState::Implicit); 6758 } 6759 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, 6760 Dst); 6761 } 6762 6763 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } 6764 6765 MachineInstr *SIInstrInfo::foldMemoryOperandImpl( 6766 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 6767 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, 6768 VirtRegMap *VRM) const { 6769 // This is a bit of a hack (copied from AArch64). 
Consider this instruction: 6770 // 6771 // %0:sreg_32 = COPY $m0 6772 // 6773 // We explicitly chose SReg_32 for the virtual register so such a copy might 6774 // be eliminated by RegisterCoalescer. However, that may not be possible, and 6775 // %0 may even spill. We can't spill $m0 normally (it would require copying to 6776 // a numbered SGPR anyway), and since it is in the SReg_32 register class, 6777 // TargetInstrInfo::foldMemoryOperand() is going to try. 6778 // 6779 // To prevent that, constrain the %0 register class here. 6780 if (MI.isFullCopy()) { 6781 Register DstReg = MI.getOperand(0).getReg(); 6782 Register SrcReg = MI.getOperand(1).getReg(); 6783 6784 if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) { 6785 MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 6786 return nullptr; 6787 } 6788 6789 if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) { 6790 MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); 6791 return nullptr; 6792 } 6793 } 6794 6795 return nullptr; 6796 } 6797 6798 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, 6799 const MachineInstr &MI, 6800 unsigned *PredCost) const { 6801 if (MI.isBundle()) { 6802 MachineBasicBlock::const_instr_iterator I(MI.getIterator()); 6803 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); 6804 unsigned Lat = 0, Count = 0; 6805 for (++I; I != E && I->isBundledWithPred(); ++I) { 6806 ++Count; 6807 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); 6808 } 6809 return Lat + Count - 1; 6810 } 6811 6812 return SchedModel.computeInstrLatency(&MI); 6813 } 6814