//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {

class AAResults;

namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AAResults *AA) const {
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is OK here since all VALU instructions have one.
    // Apart from that exception we want the generic logic.

    // Another potential implicit use is the mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere
    // in the function, otherwise it is safe since mode is not changed.

    // This differs from the generic method, which does not allow
    // rematerialization if there are virtual register uses. We allow this,
    // therefore this method includes SOP instructions as well.
    return !MI.hasImplicitDef() &&
           MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
           !MI.mayRaiseFPException();
  }

  return false;
}

// Returns true if the scalar result of a VALU instruction depends on exec.
static bool resultDependsOnExec(const MachineInstr &MI) {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
  if (MI.isCompare()) {
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    Register DstReg = MI.getOperand(0).getReg();
    if (!DstReg.isVirtual())
      return true;
    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:
        break;
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC))
          return true;
        break;
      default:
        return true;
      }
    }
    return false;
  }

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::V_READFIRSTLANE_B32:
    return true;
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      Width = getOpSize(LdSt, DataOpIdx);
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm();
      unsigned Offset1 = Offset1Op->getImm();
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = getOpSize(LdSt, DataOpIdx);
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width += getOpSize(LdSt, DataOpIdx);
      } else {
        Width = getOpSize(LdSt, DataOpIdx);
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset) {
      if (SOffset->isReg())
        BaseOps.push_back(SOffset);
      else
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isMIMG(LdSt)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    Offset = 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  return false;
}

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.
    return false;
  }

  // In order to avoid register pressure, on an average, the number of DWORDS
  // loaded together by all clustered mem ops should not exceed 8. This is an
  // empirical value based on certain observations and performance related
  // experiments.
  // The good thing about this heuristic is that it avoids clustering of too
  // many sub-word loads, and also avoids clustering of wide loads. Below is a
  // brief summary of how the heuristic behaves for various `LoadSize`.
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
  const unsigned LoadSize = NumBytes / NumLoads;
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
  return NumDWORDs <= 8;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have less than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal SGPR to VGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  assert((TII.getSubtarget().hasMAIInsts() &&
          !TII.getSubtarget().hasGFX90AInsts()) &&
         "Expected GFX908 subtarget.");

  assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
          AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
         "Source register of the copy should be either an SGPR or an AGPR.");

  assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
         "Destination register of the copy should be an AGPR.");

  const SIRegisterInfo &RI = TII.getRegisterInfo();

  // First try to find defining accvgpr_write to avoid temporary registers.
  for (auto Def = MI, E = MBB.begin(); Def != E; ) {
    --Def;
    if (!Def->definesRegister(SrcReg, &RI))
      continue;
    if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      break;

    MachineOperand &DefOp = Def->getOperand(1);
    assert(DefOp.isReg() || DefOp.isImm());

    if (DefOp.isReg()) {
      // Check that the register source operand is not clobbered before MI.
      // Immediate operands are always safe to propagate.
      bool SafeToPropagate = true;
      for (auto I = Def; I != MI && SafeToPropagate; ++I)
        if (I->modifiesRegister(DefOp.getReg(), &RI))
          SafeToPropagate = false;

      if (!SafeToPropagate)
        break;

      DefOp.setIsKill(false);
    }

    MachineInstrBuilder Builder =
      BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
        .add(DefOp);
    if (ImpDefSuperReg)
      Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

    if (ImpUseSuperReg) {
      Builder.addReg(ImpUseSuperReg,
                     getKillRegState(KillSrc) | RegState::Implicit);
    }

    return;
  }

  RS.enterBasicBlock(MBB);
  RS.forward(MI);

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                             *MBB.getParent());

  // Registers in the sequence are allocated contiguously so we can just
  // use register number to pick one of three round-robin temps.
  unsigned RegNo = DestReg % 3;
  Register Tmp =
      MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
  assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
         "VGPR used for an intermediate copy should have been reserved.");

  // Only loop through if there are any free registers left, otherwise
  // the scavenger may report a fatal error without an emergency spill slot
  // or spill with the slot.
  while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
    Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
      break;
    Tmp = Tmp2;
    RS.setRegUsed(Tmp);
  }

  // Insert copy to temporary VGPR.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  }

  MachineInstrBuilder UseBuilder =
      BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
          .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }

  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
        .addReg(Tmp, RegState::Kill);

  if (ImpDefSuperReg)
    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}

static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register Reg = RI.getSubReg(DestReg, SubIdx);
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is SGPR aligned? If so try to combine with next.
    Register Src = RI.getSubReg(SrcReg, SubIdx);
    bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use SGPR64 copy
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      Opcode = AMDGPU::S_MOV_B64;
      Idx++;
    }

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
                 .addReg(RI.getSubReg(SrcReg, SubIdx))
                 .addReg(SrcReg, RegState::Implicit);

    if (!FirstMI)
      FirstMI = LastMI;

    if (!Forward)
      I--;
  }

  assert(FirstMI && LastMI);
  if (!Forward)
    std::swap(FirstMI, LastMI);

  FirstMI->addOperand(
      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

  if (KillSrc)
    LastMI->addRegisterKilled(SrcReg, &RI);
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

  // FIXME: This is a hack to resolve copies between 16 bit and 32 bit
  // registers until all patterns are fixed.
  if (Fix16BitCopies &&
      ((RI.getRegSizeInBits(*RC) == 16) ^
       (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
    MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
    MCRegister Super = RI.get32BitRegister(RegToFix);
    assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
    RegToFix = Super;

    if (DestReg == SrcReg) {
      // Insert empty bundle since ExpandPostRA expects an instruction here.
      BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
      return;
    }

    RC = RI.getPhysRegClass(DestReg);
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copying 64-bit or 32-bit sources to SCC barely makes sense,
    // but SelectionDAG emits such copies for i1 sources.
    if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      // This copy can only be produced by patterns
      // with explicit SCC, which are known to be enabled
      // only for subtargets with S_CMP_LG_U64 present.
      assert(ST.hasScalarCompareEq64());
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    } else {
      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0);
    }

    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
        (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }

    // FIXME: Pass should maintain scavenger to avoid scan through the block on
    // every AGPR spill.
    RegScavenger RS;
    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
    return;
  }

  const unsigned Size = RI.getRegSizeInBits(*RC);
  if (Size == 16) {
    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
           AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
                   .addImm(0) // src0_modifiers
                   .addReg(NewSrcReg)
                   .addImm(0) // clamp
                   .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                   .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                  : AMDGPU::SDWA::SdwaSel::WORD_1)
                   .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
  if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
    if (ST.hasMovB64()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
      return;
    }
    if (ST.hasPackedFP32Ops()) {
      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
        .addImm(SISrcMods::OP_SEL_1)
        .addReg(SrcReg)
        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
        .addReg(SrcReg)
        .addImm(0) // op_sel_lo
        .addImm(0) // op_sel_hi
        .addImm(0) // neg_lo
        .addImm(0) // neg_hi
        .addImm(0) // clamp
        .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
      return;
    }
  }

  const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
  if (RI.isSGPRClass(RC)) {
    if (!RI.isSGPRClass(SrcRC)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
    const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
    expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
                   Forward);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isAGPRClass(RC)) {
    if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
      Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
    else if (RI.hasVGPRs(SrcRC) ||
             (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
      Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
    else
      Opcode = AMDGPU::INSTRUCTION_LIST_END;
  } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
             (RI.isProperlyAlignedRC(*RC) &&
              (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
    // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
    if (ST.hasMovB64()) {
      Opcode = AMDGPU::V_MOV_B64_e32;
      EltSize = 8;
    } else if (ST.hasPackedFP32Ops()) {
      Opcode = AMDGPU::V_PK_MOV_B32;
      EltSize = 8;
    }
  }

  // For the cases where we need an intermediate instruction/temporary register
  // (destination is an AGPR), we need a scavenger.
  //
  // FIXME: The pass should maintain this for us so we don't have to re-scan the
  // whole block for every handled copy.
  std::unique_ptr<RegScavenger> RS;
  if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
    RS.reset(new RegScavenger());

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);

  // If there is an overlap, we can't kill the super-register on the last
  // instruction, since it will also kill the components made live by this def.
  const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;

    if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
      Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
      Register ImpUseSuper = SrcReg;
      indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                         RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
                         ImpDefSuper, ImpUseSuper);
    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
      Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
      Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
      MachineInstrBuilder MIB =
          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
              .addImm(SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
              .addReg(SrcSubReg)
              .addImm(0) // op_sel_lo
              .addImm(0) // op_sel_hi
              .addImm(0) // neg_lo
              .addImm(0) // neg_hi
              .addImm(0) // clamp
              .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
      if (Idx == 0)
        MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
    } else {
      MachineInstrBuilder Builder =
          BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
              .addReg(RI.getSubReg(SrcReg, SubIdx));
      if (Idx == 0)
        Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

      Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
    }
  }
}

int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
    Builder.addImm(IdxValue);
  }
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}

void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(TrueReg)
        .addImm(0)
        .addReg(FalseReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (RI.isAGPRClass(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 32) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

const MCInstrDesc &
SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
                                     bool IsIndirectSrc) const {
  if (IsIndirectSrc) {
    if (VecSize <= 32) // 4 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
    if (VecSize <= 64) // 8 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
    if (VecSize <= 96) // 12 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
    if (VecSize <= 128) // 16 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
    if (VecSize <= 160) // 20 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
    if (VecSize <= 256) // 32 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
    if (VecSize <= 512) // 64 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
    if (VecSize <= 1024) // 128 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);

    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
  }

  if (VecSize <= 32) // 4 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
  if (VecSize <= 64) // 8 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
  if (VecSize <= 96) // 12 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
  if (VecSize <= 128) // 16 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
  if (VecSize <= 160) // 20 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
  if (VecSize <= 256) // 32 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
  if (VecSize <= 512) // 64 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
  if (VecSize <= 1024) // 128 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);

  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
}

static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

const MCInstrDesc &
SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
                                             bool IsSGPR) const {
  if (IsSGPR) {
    switch (EltSize) {
    case 32:
      return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
    case 64:
      return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
    default:
      llvm_unreachable("invalid reg indexing elt size");
    }
  }

  assert(EltSize == 32 && "invalid reg indexing elt size");
  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_S96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_S160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_S192_SAVE;
  case 28:
    return AMDGPU::SI_SPILL_S224_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_V160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_V192_SAVE;
  case 28:
    return AMDGPU::SI_SPILL_V224_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_A64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_A96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_A128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_A160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_A192_SAVE;
  case 28:
    return AMDGPU::SI_SPILL_A224_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_A256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAVSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_AV32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_AV64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_AV96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_AV128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_AV160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_AV192_SAVE;
  case 28:
    return AMDGPU::SI_SPILL_AV224_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_AV256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_AV512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_AV1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (SrcReg.isVirtual() && SpillSize == 4) {
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize)
                    : RI.isAGPRClass(RC)      ? getAGPRSpillSaveOpcode(SpillSize)
                                              : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg, getKillRegState(isKill)) // data
    .addFrameIndex(FrameIndex)               // addr
    .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
    .addImm(0)                               // offset
    .addMemOperand(MMO);
}

static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_S192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_S224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_V192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_V224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_A96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_A160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_A192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_A224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_A256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAVSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_AV32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_AV64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_AV96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_AV128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_AV160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_AV192_RESTORE;
  case 28:
    return AMDGPU::SI_SPILL_AV224_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_AV256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_AV512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_AV1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
      *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
           DestReg != AMDGPU::EXEC && "exec should not be spilled");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    return;
  }

  unsigned Opcode = RI.isVectorSuperClass(RC)
                        ? getAVSpillRestoreOpcode(SpillSize)
                        : RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                             : getVGPRSpillRestoreOpcode(SpillSize);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)           // vaddr
      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
      .addImm(0)                           // offset
      .addMemOperand(MMO);
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertNoops(MBB, MI, 1);
}

void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              unsigned Quantity) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
  }
}

void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    if (MI.isMetaInstruction())
      return 0;
    return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;

  // FIXME: Any other pseudo instruction?
  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
  // hazard, even if one exists, won't really be visible. Should we handle it?
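  // The pseudos below are meta instructions or expand to no real instructions
  // when code is emitted, so they take no wait states.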
1774 case AMDGPU::SI_MASKED_UNREACHABLE: 1775 case AMDGPU::WAVE_BARRIER: 1776 case AMDGPU::SCHED_BARRIER: 1777 return 0; 1778 } 1779 } 1780 1781 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1782 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1783 MachineBasicBlock &MBB = *MI.getParent(); 1784 DebugLoc DL = MBB.findDebugLoc(MI); 1785 switch (MI.getOpcode()) { 1786 default: return TargetInstrInfo::expandPostRAPseudo(MI); 1787 case AMDGPU::S_MOV_B64_term: 1788 // This is only a terminator to get the correct spill code placement during 1789 // register allocation. 1790 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1791 break; 1792 1793 case AMDGPU::S_MOV_B32_term: 1794 // This is only a terminator to get the correct spill code placement during 1795 // register allocation. 1796 MI.setDesc(get(AMDGPU::S_MOV_B32)); 1797 break; 1798 1799 case AMDGPU::S_XOR_B64_term: 1800 // This is only a terminator to get the correct spill code placement during 1801 // register allocation. 1802 MI.setDesc(get(AMDGPU::S_XOR_B64)); 1803 break; 1804 1805 case AMDGPU::S_XOR_B32_term: 1806 // This is only a terminator to get the correct spill code placement during 1807 // register allocation. 1808 MI.setDesc(get(AMDGPU::S_XOR_B32)); 1809 break; 1810 case AMDGPU::S_OR_B64_term: 1811 // This is only a terminator to get the correct spill code placement during 1812 // register allocation. 1813 MI.setDesc(get(AMDGPU::S_OR_B64)); 1814 break; 1815 case AMDGPU::S_OR_B32_term: 1816 // This is only a terminator to get the correct spill code placement during 1817 // register allocation. 1818 MI.setDesc(get(AMDGPU::S_OR_B32)); 1819 break; 1820 1821 case AMDGPU::S_ANDN2_B64_term: 1822 // This is only a terminator to get the correct spill code placement during 1823 // register allocation. 1824 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 1825 break; 1826 1827 case AMDGPU::S_ANDN2_B32_term: 1828 // This is only a terminator to get the correct spill code placement during 1829 // register allocation. 1830 MI.setDesc(get(AMDGPU::S_ANDN2_B32)); 1831 break; 1832 1833 case AMDGPU::S_AND_B64_term: 1834 // This is only a terminator to get the correct spill code placement during 1835 // register allocation. 1836 MI.setDesc(get(AMDGPU::S_AND_B64)); 1837 break; 1838 1839 case AMDGPU::S_AND_B32_term: 1840 // This is only a terminator to get the correct spill code placement during 1841 // register allocation. 1842 MI.setDesc(get(AMDGPU::S_AND_B32)); 1843 break; 1844 1845 case AMDGPU::V_MOV_B64_PSEUDO: { 1846 Register Dst = MI.getOperand(0).getReg(); 1847 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1848 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1849 1850 const MachineOperand &SrcOp = MI.getOperand(1); 1851 // FIXME: Will this work for 64-bit floating point immediates? 
1852 assert(!SrcOp.isFPImm()); 1853 if (ST.hasMovB64()) { 1854 MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); 1855 if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm())) 1856 break; 1857 } 1858 if (SrcOp.isImm()) { 1859 APInt Imm(64, SrcOp.getImm()); 1860 APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 1861 APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 1862 if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { 1863 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 1864 .addImm(SISrcMods::OP_SEL_1) 1865 .addImm(Lo.getSExtValue()) 1866 .addImm(SISrcMods::OP_SEL_1) 1867 .addImm(Lo.getSExtValue()) 1868 .addImm(0) // op_sel_lo 1869 .addImm(0) // op_sel_hi 1870 .addImm(0) // neg_lo 1871 .addImm(0) // neg_hi 1872 .addImm(0); // clamp 1873 } else { 1874 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1875 .addImm(Lo.getSExtValue()) 1876 .addReg(Dst, RegState::Implicit | RegState::Define); 1877 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1878 .addImm(Hi.getSExtValue()) 1879 .addReg(Dst, RegState::Implicit | RegState::Define); 1880 } 1881 } else { 1882 assert(SrcOp.isReg()); 1883 if (ST.hasPackedFP32Ops() && 1884 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { 1885 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 1886 .addImm(SISrcMods::OP_SEL_1) // src0_mod 1887 .addReg(SrcOp.getReg()) 1888 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod 1889 .addReg(SrcOp.getReg()) 1890 .addImm(0) // op_sel_lo 1891 .addImm(0) // op_sel_hi 1892 .addImm(0) // neg_lo 1893 .addImm(0) // neg_hi 1894 .addImm(0); // clamp 1895 } else { 1896 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1897 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 1898 .addReg(Dst, RegState::Implicit | RegState::Define); 1899 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1900 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 1901 .addReg(Dst, RegState::Implicit | RegState::Define); 1902 } 1903 } 1904 MI.eraseFromParent(); 1905 break; 1906 } 1907 case AMDGPU::V_MOV_B64_DPP_PSEUDO: { 1908 expandMovDPP64(MI); 1909 break; 1910 } 1911 case AMDGPU::S_MOV_B64_IMM_PSEUDO: { 1912 const MachineOperand &SrcOp = MI.getOperand(1); 1913 assert(!SrcOp.isFPImm()); 1914 APInt Imm(64, SrcOp.getImm()); 1915 if (Imm.isIntN(32) || isInlineConstant(Imm)) { 1916 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1917 break; 1918 } 1919 1920 Register Dst = MI.getOperand(0).getReg(); 1921 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1922 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1923 1924 APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 1925 APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 1926 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) 1927 .addImm(Lo.getSExtValue()) 1928 .addReg(Dst, RegState::Implicit | RegState::Define); 1929 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) 1930 .addImm(Hi.getSExtValue()) 1931 .addReg(Dst, RegState::Implicit | RegState::Define); 1932 MI.eraseFromParent(); 1933 break; 1934 } 1935 case AMDGPU::V_SET_INACTIVE_B32: { 1936 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1937 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1938 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM 1939 // optimizations (mainly Register Coalescer) aware of WWM register liveness. 
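    // The expansion below writes the active lanes, inverts exec to write the
    // inactive lanes, then restores exec:
    //   v_mov_b32 dst, src        ; active lanes
    //   s_not_b64 exec, exec      ; (s_not_b32 on wave32)
    //   v_mov_b32 dst, inactive   ; inactive lanes
    //   s_not_b64 exec, exec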
1940 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 1941 .add(MI.getOperand(1)); 1942 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 1943 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 1944 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 1945 .add(MI.getOperand(2)); 1946 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1947 .addReg(Exec); 1948 MI.eraseFromParent(); 1949 break; 1950 } 1951 case AMDGPU::V_SET_INACTIVE_B64: { 1952 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1953 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1954 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 1955 MI.getOperand(0).getReg()) 1956 .add(MI.getOperand(1)); 1957 expandPostRAPseudo(*Copy); 1958 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 1959 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 1960 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 1961 MI.getOperand(0).getReg()) 1962 .add(MI.getOperand(2)); 1963 expandPostRAPseudo(*Copy); 1964 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1965 .addReg(Exec); 1966 MI.eraseFromParent(); 1967 break; 1968 } 1969 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: 1970 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: 1971 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: 1972 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: 1973 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: 1974 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: 1975 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: 1976 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: 1977 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: 1978 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: 1979 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: 1980 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: 1981 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: 1982 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: 1983 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: 1984 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: 1985 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: 1986 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: 1987 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: 1988 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: 1989 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { 1990 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); 1991 1992 unsigned Opc; 1993 if (RI.hasVGPRs(EltRC)) { 1994 Opc = AMDGPU::V_MOVRELD_B32_e32; 1995 } else { 1996 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 1997 : AMDGPU::S_MOVRELD_B32; 1998 } 1999 2000 const MCInstrDesc &OpDesc = get(Opc); 2001 Register VecReg = MI.getOperand(0).getReg(); 2002 bool IsUndef = MI.getOperand(1).isUndef(); 2003 unsigned SubReg = MI.getOperand(3).getImm(); 2004 assert(VecReg == MI.getOperand(1).getReg()); 2005 2006 MachineInstrBuilder MIB = 2007 BuildMI(MBB, MI, DL, OpDesc) 2008 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2009 .add(MI.getOperand(2)) 2010 .addReg(VecReg, RegState::ImplicitDefine) 2011 .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); 2012 2013 const int ImpDefIdx = 2014 OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); 2015 const int ImpUseIdx = ImpDefIdx + 1; 2016 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 2017 MI.eraseFromParent(); 2018 break; 2019 } 2020 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: 2021 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: 2022 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: 2023 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: 2024 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: 2025 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: 2026 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: 2027 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { 2028 assert(ST.useVGPRIndexMode()); 2029 Register VecReg = MI.getOperand(0).getReg(); 2030 bool IsUndef = MI.getOperand(1).isUndef(); 2031 Register Idx = MI.getOperand(3).getReg(); 2032 Register SubReg = MI.getOperand(4).getImm(); 2033 2034 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 2035 .addReg(Idx) 2036 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2037 SetOn->getOperand(3).setIsUndef(); 2038 2039 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write); 2040 MachineInstrBuilder MIB = 2041 BuildMI(MBB, MI, DL, OpDesc) 2042 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2043 .add(MI.getOperand(2)) 2044 .addReg(VecReg, RegState::ImplicitDefine) 2045 .addReg(VecReg, 2046 RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 2047 2048 const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); 2049 const int ImpUseIdx = ImpDefIdx + 1; 2050 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 2051 2052 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); 2053 2054 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); 2055 2056 MI.eraseFromParent(); 2057 break; 2058 } 2059 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: 2060 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: 2061 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: 2062 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: 2063 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: 2064 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: 2065 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: 2066 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { 2067 assert(ST.useVGPRIndexMode()); 2068 Register Dst = MI.getOperand(0).getReg(); 2069 Register VecReg = MI.getOperand(1).getReg(); 2070 bool IsUndef = MI.getOperand(1).isUndef(); 2071 Register Idx = MI.getOperand(2).getReg(); 2072 Register SubReg = MI.getOperand(3).getImm(); 2073 2074 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 2075 .addReg(Idx) 2076 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2077 SetOn->getOperand(3).setIsUndef(); 2078 2079 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) 2080 .addDef(Dst) 2081 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2082 .addReg(VecReg, RegState::Implicit | (IsUndef ? 
                            RegState::Undef : 0));

    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));

    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_STRICT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // Whole Wave Mode is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::ENTER_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // STRICT_WQM is entered.
    const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
    const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
    BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::EXIT_STRICT_WWM:
  case AMDGPU::EXIT_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM/STRICT_WQM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::SI_RETURN: {
    const MachineFunction *MF = MBB.getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    // Hiding the return address use with SI_RETURN may lead to extra kills in
    // the function and missing live-ins. We are fine in practice because callee
    // saved register handling ensures the register value is restored before
    // RET, but we need the undef flag here to appease the MachineVerifier
    // liveness checks.
2153 MachineInstrBuilder MIB = 2154 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return)) 2155 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef); 2156 2157 MIB.copyImplicitOps(MI); 2158 MI.eraseFromParent(); 2159 break; 2160 } 2161 } 2162 return true; 2163 } 2164 2165 std::pair<MachineInstr*, MachineInstr*> 2166 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { 2167 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); 2168 2169 if (ST.hasMovB64() && 2170 AMDGPU::isLegal64BitDPPControl( 2171 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { 2172 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); 2173 return std::make_pair(&MI, nullptr); 2174 } 2175 2176 MachineBasicBlock &MBB = *MI.getParent(); 2177 DebugLoc DL = MBB.findDebugLoc(MI); 2178 MachineFunction *MF = MBB.getParent(); 2179 MachineRegisterInfo &MRI = MF->getRegInfo(); 2180 Register Dst = MI.getOperand(0).getReg(); 2181 unsigned Part = 0; 2182 MachineInstr *Split[2]; 2183 2184 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { 2185 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); 2186 if (Dst.isPhysical()) { 2187 MovDPP.addDef(RI.getSubReg(Dst, Sub)); 2188 } else { 2189 assert(MRI.isSSA()); 2190 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2191 MovDPP.addDef(Tmp); 2192 } 2193 2194 for (unsigned I = 1; I <= 2; ++I) { // old and src operands. 2195 const MachineOperand &SrcOp = MI.getOperand(I); 2196 assert(!SrcOp.isFPImm()); 2197 if (SrcOp.isImm()) { 2198 APInt Imm(64, SrcOp.getImm()); 2199 Imm.ashrInPlace(Part * 32); 2200 MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); 2201 } else { 2202 assert(SrcOp.isReg()); 2203 Register Src = SrcOp.getReg(); 2204 if (Src.isPhysical()) 2205 MovDPP.addReg(RI.getSubReg(Src, Sub)); 2206 else 2207 MovDPP.addReg(Src, SrcOp.isUndef() ? 
RegState::Undef : 0, Sub); 2208 } 2209 } 2210 2211 for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) 2212 MovDPP.addImm(MI.getOperand(I).getImm()); 2213 2214 Split[Part] = MovDPP; 2215 ++Part; 2216 } 2217 2218 if (Dst.isVirtual()) 2219 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) 2220 .addReg(Split[0]->getOperand(0).getReg()) 2221 .addImm(AMDGPU::sub0) 2222 .addReg(Split[1]->getOperand(0).getReg()) 2223 .addImm(AMDGPU::sub1); 2224 2225 MI.eraseFromParent(); 2226 return std::make_pair(Split[0], Split[1]); 2227 } 2228 2229 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 2230 MachineOperand &Src0, 2231 unsigned Src0OpName, 2232 MachineOperand &Src1, 2233 unsigned Src1OpName) const { 2234 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 2235 if (!Src0Mods) 2236 return false; 2237 2238 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 2239 assert(Src1Mods && 2240 "All commutable instructions have both src0 and src1 modifiers"); 2241 2242 int Src0ModsVal = Src0Mods->getImm(); 2243 int Src1ModsVal = Src1Mods->getImm(); 2244 2245 Src1Mods->setImm(Src0ModsVal); 2246 Src0Mods->setImm(Src1ModsVal); 2247 return true; 2248 } 2249 2250 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 2251 MachineOperand &RegOp, 2252 MachineOperand &NonRegOp) { 2253 Register Reg = RegOp.getReg(); 2254 unsigned SubReg = RegOp.getSubReg(); 2255 bool IsKill = RegOp.isKill(); 2256 bool IsDead = RegOp.isDead(); 2257 bool IsUndef = RegOp.isUndef(); 2258 bool IsDebug = RegOp.isDebug(); 2259 2260 if (NonRegOp.isImm()) 2261 RegOp.ChangeToImmediate(NonRegOp.getImm()); 2262 else if (NonRegOp.isFI()) 2263 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 2264 else if (NonRegOp.isGlobal()) { 2265 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), 2266 NonRegOp.getTargetFlags()); 2267 } else 2268 return nullptr; 2269 2270 // Make sure we don't reinterpret a subreg index in the target flags. 2271 RegOp.setTargetFlags(NonRegOp.getTargetFlags()); 2272 2273 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 2274 NonRegOp.setSubReg(SubReg); 2275 2276 return &MI; 2277 } 2278 2279 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 2280 unsigned Src0Idx, 2281 unsigned Src1Idx) const { 2282 assert(!NewMI && "this should never be used"); 2283 2284 unsigned Opc = MI.getOpcode(); 2285 int CommutedOpcode = commuteOpcode(Opc); 2286 if (CommutedOpcode == -1) 2287 return nullptr; 2288 2289 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 2290 static_cast<int>(Src0Idx) && 2291 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 2292 static_cast<int>(Src1Idx) && 2293 "inconsistency with findCommutedOpIndices"); 2294 2295 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2296 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2297 2298 MachineInstr *CommutedMI = nullptr; 2299 if (Src0.isReg() && Src1.isReg()) { 2300 if (isOperandLegal(MI, Src1Idx, &Src0)) { 2301 // Be sure to copy the source modifiers to the right place. 2302 CommutedMI 2303 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 2304 } 2305 2306 } else if (Src0.isReg() && !Src1.isReg()) { 2307 // src0 should always be able to support any operand type, so no need to 2308 // check operand legality. 
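    // E.g. an immediate in src1 is moved into src0 and the register into src1;
    // the opcode is then switched to its commuted form (such as sub vs.
    // subrev) below once the swap succeeds.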
2309 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 2310 } else if (!Src0.isReg() && Src1.isReg()) { 2311 if (isOperandLegal(MI, Src1Idx, &Src0)) 2312 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 2313 } else { 2314 // FIXME: Found two non registers to commute. This does happen. 2315 return nullptr; 2316 } 2317 2318 if (CommutedMI) { 2319 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 2320 Src1, AMDGPU::OpName::src1_modifiers); 2321 2322 CommutedMI->setDesc(get(CommutedOpcode)); 2323 } 2324 2325 return CommutedMI; 2326 } 2327 2328 // This needs to be implemented because the source modifiers may be inserted 2329 // between the true commutable operands, and the base 2330 // TargetInstrInfo::commuteInstruction uses it. 2331 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, 2332 unsigned &SrcOpIdx0, 2333 unsigned &SrcOpIdx1) const { 2334 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); 2335 } 2336 2337 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, 2338 unsigned &SrcOpIdx1) const { 2339 if (!Desc.isCommutable()) 2340 return false; 2341 2342 unsigned Opc = Desc.getOpcode(); 2343 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2344 if (Src0Idx == -1) 2345 return false; 2346 2347 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2348 if (Src1Idx == -1) 2349 return false; 2350 2351 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 2352 } 2353 2354 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 2355 int64_t BrOffset) const { 2356 // BranchRelaxation should never have to check s_setpc_b64 because its dest 2357 // block is unanalyzable. 2358 assert(BranchOp != AMDGPU::S_SETPC_B64); 2359 2360 // Convert to dwords. 2361 BrOffset /= 4; 2362 2363 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 2364 // from the next instruction. 2365 BrOffset -= 1; 2366 2367 return isIntN(BranchOffsetBits, BrOffset); 2368 } 2369 2370 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 2371 const MachineInstr &MI) const { 2372 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 2373 // This would be a difficult analysis to perform, but can always be legal so 2374 // there's no need to analyze it. 2375 return nullptr; 2376 } 2377 2378 return MI.getOperand(0).getMBB(); 2379 } 2380 2381 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 2382 MachineBasicBlock &DestBB, 2383 MachineBasicBlock &RestoreBB, 2384 const DebugLoc &DL, int64_t BrOffset, 2385 RegScavenger *RS) const { 2386 assert(RS && "RegScavenger required for long branching"); 2387 assert(MBB.empty() && 2388 "new block should be inserted for expanding unconditional branch"); 2389 assert(MBB.pred_size() == 1); 2390 assert(RestoreBB.empty() && 2391 "restore block should be inserted for restoring clobbered registers"); 2392 2393 MachineFunction *MF = MBB.getParent(); 2394 MachineRegisterInfo &MRI = MF->getRegInfo(); 2395 2396 // FIXME: Virtual register workaround for RegScavenger not working with empty 2397 // blocks. 2398 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2399 2400 auto I = MBB.end(); 2401 2402 // We need to compute the offset relative to the instruction immediately after 2403 // s_getpc_b64. Insert pc arithmetic code before last terminator. 
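  // The sequence emitted below is:
  //   s_getpc_b64 pc                               ; post_getpc label here
  //   s_add_u32   pc_lo, pc_lo, (dest - post_getpc) & 0xffffffff
  //   s_addc_u32  pc_hi, pc_hi, (dest - post_getpc) >> 32   ; arithmetic shift
  //   s_setpc_b64 pc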
2404 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 2405 2406 auto &MCCtx = MF->getContext(); 2407 MCSymbol *PostGetPCLabel = 2408 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); 2409 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); 2410 2411 MCSymbol *OffsetLo = 2412 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); 2413 MCSymbol *OffsetHi = 2414 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); 2415 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 2416 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 2417 .addReg(PCReg, 0, AMDGPU::sub0) 2418 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); 2419 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 2420 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 2421 .addReg(PCReg, 0, AMDGPU::sub1) 2422 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); 2423 2424 // Insert the indirect branch after the other terminator. 2425 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 2426 .addReg(PCReg); 2427 2428 // FIXME: If spilling is necessary, this will fail because this scavenger has 2429 // no emergency stack slots. It is non-trivial to spill in this situation, 2430 // because the restore code needs to be specially placed after the 2431 // jump. BranchRelaxation then needs to be made aware of the newly inserted 2432 // block. 2433 // 2434 // If a spill is needed for the pc register pair, we need to insert a spill 2435 // restore block right before the destination block, and insert a short branch 2436 // into the old destination block's fallthrough predecessor. 2437 // e.g.: 2438 // 2439 // s_cbranch_scc0 skip_long_branch: 2440 // 2441 // long_branch_bb: 2442 // spill s[8:9] 2443 // s_getpc_b64 s[8:9] 2444 // s_add_u32 s8, s8, restore_bb 2445 // s_addc_u32 s9, s9, 0 2446 // s_setpc_b64 s[8:9] 2447 // 2448 // skip_long_branch: 2449 // foo; 2450 // 2451 // ..... 2452 // 2453 // dest_bb_fallthrough_predecessor: 2454 // bar; 2455 // s_branch dest_bb 2456 // 2457 // restore_bb: 2458 // restore s[8:9] 2459 // fallthrough dest_bb 2460 /// 2461 // dest_bb: 2462 // buzz; 2463 2464 RS->enterBasicBlockEnd(MBB); 2465 Register Scav = RS->scavengeRegisterBackwards( 2466 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), 2467 /* RestoreAfter */ false, 0, /* AllowSpill */ false); 2468 if (Scav) { 2469 RS->setRegUsed(Scav); 2470 MRI.replaceRegWith(PCReg, Scav); 2471 MRI.clearVirtRegs(); 2472 } else { 2473 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for 2474 // SGPR spill. 2475 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 2476 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2477 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS); 2478 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1); 2479 MRI.clearVirtRegs(); 2480 } 2481 2482 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol(); 2483 // Now, the distance could be defined. 2484 auto *Offset = MCBinaryExpr::createSub( 2485 MCSymbolRefExpr::create(DestLabel, MCCtx), 2486 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); 2487 // Add offset assignments. 
2488 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); 2489 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); 2490 auto *ShAmt = MCConstantExpr::create(32, MCCtx); 2491 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); 2492 } 2493 2494 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 2495 switch (Cond) { 2496 case SIInstrInfo::SCC_TRUE: 2497 return AMDGPU::S_CBRANCH_SCC1; 2498 case SIInstrInfo::SCC_FALSE: 2499 return AMDGPU::S_CBRANCH_SCC0; 2500 case SIInstrInfo::VCCNZ: 2501 return AMDGPU::S_CBRANCH_VCCNZ; 2502 case SIInstrInfo::VCCZ: 2503 return AMDGPU::S_CBRANCH_VCCZ; 2504 case SIInstrInfo::EXECNZ: 2505 return AMDGPU::S_CBRANCH_EXECNZ; 2506 case SIInstrInfo::EXECZ: 2507 return AMDGPU::S_CBRANCH_EXECZ; 2508 default: 2509 llvm_unreachable("invalid branch predicate"); 2510 } 2511 } 2512 2513 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 2514 switch (Opcode) { 2515 case AMDGPU::S_CBRANCH_SCC0: 2516 return SCC_FALSE; 2517 case AMDGPU::S_CBRANCH_SCC1: 2518 return SCC_TRUE; 2519 case AMDGPU::S_CBRANCH_VCCNZ: 2520 return VCCNZ; 2521 case AMDGPU::S_CBRANCH_VCCZ: 2522 return VCCZ; 2523 case AMDGPU::S_CBRANCH_EXECNZ: 2524 return EXECNZ; 2525 case AMDGPU::S_CBRANCH_EXECZ: 2526 return EXECZ; 2527 default: 2528 return INVALID_BR; 2529 } 2530 } 2531 2532 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 2533 MachineBasicBlock::iterator I, 2534 MachineBasicBlock *&TBB, 2535 MachineBasicBlock *&FBB, 2536 SmallVectorImpl<MachineOperand> &Cond, 2537 bool AllowModify) const { 2538 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2539 // Unconditional Branch 2540 TBB = I->getOperand(0).getMBB(); 2541 return false; 2542 } 2543 2544 MachineBasicBlock *CondBB = nullptr; 2545 2546 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 2547 CondBB = I->getOperand(1).getMBB(); 2548 Cond.push_back(I->getOperand(0)); 2549 } else { 2550 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 2551 if (Pred == INVALID_BR) 2552 return true; 2553 2554 CondBB = I->getOperand(0).getMBB(); 2555 Cond.push_back(MachineOperand::CreateImm(Pred)); 2556 Cond.push_back(I->getOperand(1)); // Save the branch register. 2557 } 2558 ++I; 2559 2560 if (I == MBB.end()) { 2561 // Conditional branch followed by fall-through. 2562 TBB = CondBB; 2563 return false; 2564 } 2565 2566 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2567 TBB = CondBB; 2568 FBB = I->getOperand(0).getMBB(); 2569 return false; 2570 } 2571 2572 return true; 2573 } 2574 2575 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 2576 MachineBasicBlock *&FBB, 2577 SmallVectorImpl<MachineOperand> &Cond, 2578 bool AllowModify) const { 2579 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2580 auto E = MBB.end(); 2581 if (I == E) 2582 return false; 2583 2584 // Skip over the instructions that are artificially terminators for special 2585 // exec management. 
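  // These are the *_term exec manipulations (see their expansions in
  // expandPostRAPseudo) that only exist as terminators so spill code is placed
  // correctly during register allocation.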
  while (I != E && !I->isBranch() && !I->isReturn()) {
    switch (I->getOpcode()) {
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_OR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_AND_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
    case AMDGPU::S_AND_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  if (I == E)
    return false;

  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}

unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  unsigned Count = 0;
  unsigned RemovedSize = 0;
  for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
    // Skip over artificial terminators when removing instructions.
    if (MI.isBranch() || MI.isReturn()) {
      RemovedSize += getInstSizeInBytes(MI);
      MI.eraseFromParent();
      ++Count;
    }
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
static void preserveCondRegFlags(MachineOperand &CondReg,
                                 const MachineOperand &OrigCond) {
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}

unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *TBB,
                                   MachineBasicBlock *FBB,
                                   ArrayRef<MachineOperand> Cond,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {
  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    if (BytesAdded)
      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
    return 1;
  }

  if (Cond.size() == 1 && Cond[0].isReg()) {
    BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
      .add(Cond[0])
      .addMBB(TBB);
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    MachineInstr *CondBr =
      BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);

    // Copy the flags onto the implicit condition register operand.
    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
    fixImplicitOperands(*CondBr);

    if (BytesAdded)
      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
    return 1;
  }

  assert(TBB && FBB);

  MachineInstr *CondBr =
    BuildMI(&MBB, DL, get(Opcode))
    .addMBB(TBB);
  fixImplicitOperands(*CondBr);
  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    .addMBB(FBB);

  MachineOperand &CondReg = CondBr->getOperand(1);
  CondReg.setIsUndef(Cond[1].isUndef());
  CondReg.setIsKill(Cond[1].isKill());

  if (BytesAdded)
    *BytesAdded = ST.hasOffset3fBug() ?
16 : 8; 2700 2701 return 2; 2702 } 2703 2704 bool SIInstrInfo::reverseBranchCondition( 2705 SmallVectorImpl<MachineOperand> &Cond) const { 2706 if (Cond.size() != 2) { 2707 return true; 2708 } 2709 2710 if (Cond[0].isImm()) { 2711 Cond[0].setImm(-Cond[0].getImm()); 2712 return false; 2713 } 2714 2715 return true; 2716 } 2717 2718 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2719 ArrayRef<MachineOperand> Cond, 2720 Register DstReg, Register TrueReg, 2721 Register FalseReg, int &CondCycles, 2722 int &TrueCycles, int &FalseCycles) const { 2723 switch (Cond[0].getImm()) { 2724 case VCCNZ: 2725 case VCCZ: { 2726 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2727 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2728 if (MRI.getRegClass(FalseReg) != RC) 2729 return false; 2730 2731 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2732 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2733 2734 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2735 return RI.hasVGPRs(RC) && NumInsts <= 6; 2736 } 2737 case SCC_TRUE: 2738 case SCC_FALSE: { 2739 // FIXME: We could insert for VGPRs if we could replace the original compare 2740 // with a vector one. 2741 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2742 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2743 if (MRI.getRegClass(FalseReg) != RC) 2744 return false; 2745 2746 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2747 2748 // Multiples of 8 can do s_cselect_b64 2749 if (NumInsts % 2 == 0) 2750 NumInsts /= 2; 2751 2752 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2753 return RI.isSGPRClass(RC); 2754 } 2755 default: 2756 return false; 2757 } 2758 } 2759 2760 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2761 MachineBasicBlock::iterator I, const DebugLoc &DL, 2762 Register DstReg, ArrayRef<MachineOperand> Cond, 2763 Register TrueReg, Register FalseReg) const { 2764 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2765 if (Pred == VCCZ || Pred == SCC_FALSE) { 2766 Pred = static_cast<BranchPredicate>(-Pred); 2767 std::swap(TrueReg, FalseReg); 2768 } 2769 2770 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2771 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2772 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2773 2774 if (DstSize == 32) { 2775 MachineInstr *Select; 2776 if (Pred == SCC_TRUE) { 2777 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 2778 .addReg(TrueReg) 2779 .addReg(FalseReg); 2780 } else { 2781 // Instruction's operands are backwards from what is expected. 
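      // v_cndmask_b32 dst, src0, src1 selects src1 in lanes where the
      // condition bit (vcc) is set, so FalseReg goes in src0 and TrueReg
      // in src1.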
2782 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 2783 .addReg(FalseReg) 2784 .addReg(TrueReg); 2785 } 2786 2787 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2788 return; 2789 } 2790 2791 if (DstSize == 64 && Pred == SCC_TRUE) { 2792 MachineInstr *Select = 2793 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2794 .addReg(TrueReg) 2795 .addReg(FalseReg); 2796 2797 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2798 return; 2799 } 2800 2801 static const int16_t Sub0_15[] = { 2802 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2803 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2804 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2805 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2806 }; 2807 2808 static const int16_t Sub0_15_64[] = { 2809 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2810 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2811 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2812 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2813 }; 2814 2815 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2816 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2817 const int16_t *SubIndices = Sub0_15; 2818 int NElts = DstSize / 32; 2819 2820 // 64-bit select is only available for SALU. 2821 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 2822 if (Pred == SCC_TRUE) { 2823 if (NElts % 2) { 2824 SelOp = AMDGPU::S_CSELECT_B32; 2825 EltRC = &AMDGPU::SGPR_32RegClass; 2826 } else { 2827 SelOp = AMDGPU::S_CSELECT_B64; 2828 EltRC = &AMDGPU::SGPR_64RegClass; 2829 SubIndices = Sub0_15_64; 2830 NElts /= 2; 2831 } 2832 } 2833 2834 MachineInstrBuilder MIB = BuildMI( 2835 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 2836 2837 I = MIB->getIterator(); 2838 2839 SmallVector<Register, 8> Regs; 2840 for (int Idx = 0; Idx != NElts; ++Idx) { 2841 Register DstElt = MRI.createVirtualRegister(EltRC); 2842 Regs.push_back(DstElt); 2843 2844 unsigned SubIdx = SubIndices[Idx]; 2845 2846 MachineInstr *Select; 2847 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { 2848 Select = 2849 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2850 .addReg(FalseReg, 0, SubIdx) 2851 .addReg(TrueReg, 0, SubIdx); 2852 } else { 2853 Select = 2854 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2855 .addReg(TrueReg, 0, SubIdx) 2856 .addReg(FalseReg, 0, SubIdx); 2857 } 2858 2859 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2860 fixImplicitOperands(*Select); 2861 2862 MIB.addReg(DstElt) 2863 .addImm(SubIdx); 2864 } 2865 } 2866 2867 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { 2868 switch (MI.getOpcode()) { 2869 case AMDGPU::V_MOV_B32_e32: 2870 case AMDGPU::V_MOV_B32_e64: 2871 case AMDGPU::V_MOV_B64_PSEUDO: 2872 case AMDGPU::V_MOV_B64_e32: 2873 case AMDGPU::V_MOV_B64_e64: 2874 case AMDGPU::S_MOV_B32: 2875 case AMDGPU::S_MOV_B64: 2876 case AMDGPU::COPY: 2877 case AMDGPU::V_ACCVGPR_WRITE_B32_e64: 2878 case AMDGPU::V_ACCVGPR_READ_B32_e64: 2879 case AMDGPU::V_ACCVGPR_MOV_B32: 2880 return true; 2881 default: 2882 return false; 2883 } 2884 } 2885 2886 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( 2887 unsigned Kind) const { 2888 switch(Kind) { 2889 case PseudoSourceValue::Stack: 2890 case PseudoSourceValue::FixedStack: 2891 return AMDGPUAS::PRIVATE_ADDRESS; 2892 case PseudoSourceValue::ConstantPool: 2893 case PseudoSourceValue::GOT: 2894 case PseudoSourceValue::JumpTable: 2895 case PseudoSourceValue::GlobalValueCallEntry: 2896 case PseudoSourceValue::ExternalSymbolCallEntry: 2897 case PseudoSourceValue::TargetCustom: 2898 return AMDGPUAS::CONSTANT_ADDRESS; 2899 
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

static void removeModOperands(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src0_modifiers);
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src1_modifiers);
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
                                              AMDGPU::OpName::src2_modifiers);

  MI.removeOperand(Src2ModIdx);
  MI.removeOperand(Src1ModIdx);
  MI.removeOperand(Src0ModIdx);
}

bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                Register Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  switch (DefMI.getOpcode()) {
  default:
    return false;
  case AMDGPU::S_MOV_B64:
    // TODO: We could fold 64-bit immediates, but this gets complicated
    // when there are sub-registers.
    return false;

  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
    break;
  }

  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
  assert(ImmOp);
  // FIXME: We could handle FrameIndex values here.
  if (!ImmOp->isImm())
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    Register DstReg = UseMI.getOperand(0).getReg();
    bool Is16Bit = getOpSize(UseMI, 0) == 2;
    bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    APInt Imm(32, ImmOp->getImm());

    if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
      Imm = Imm.ashr(16);

    if (RI.isAGPR(*MRI, DstReg)) {
      if (!isInlineConstant(Imm))
        return false;
      NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
    }

    if (Is16Bit) {
      if (isVGPRCopy)
        return false; // Do not clobber vgpr_hi16

      if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
        return false;

      UseMI.getOperand(0).setSubReg(0);
      if (DstReg.isPhysical()) {
        DstReg = RI.get32BitRegister(DstReg);
        UseMI.getOperand(0).setReg(DstReg);
      }
      assert(UseMI.getOperand(1).getReg().isVirtual());
    }

    UseMI.setDesc(get(NewOpc));
    UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    return true;
  }

  if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
      Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
      Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
      Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
    // Don't fold if we are using source or output modifiers. The new VOP2
    // instructions don't have them.
    if (hasAnyModifiersSet(UseMI))
      return false;

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);

    // Any src operand can be used for the legality check.
    if (isInlineConstant(UseMI, *Src0, *ImmOp))
      return false;

    bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
                 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
    bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
                 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    // We should only expect these to be on src0 due to canonicalization.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      unsigned NewOpc =
          IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
                : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
      if (pseudoToMCOpcode(NewOpc) == -1)
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = ImmOp->getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.removeOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.removeOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      Register Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));

      bool DeleteDef = MRI->use_nodbg_empty(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      bool Src0Inlined = false;
      if (Src0->isReg()) {
        // Try to inline constant if possible.
        // If the Def is a move-immediate and Src0 is its only use, folding it
        // here saves a VGPR.
        MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
        if (Def && Def->isMoveImmediate() &&
            isInlineConstant(Def->getOperand(1)) &&
            MRI->hasOneUse(Src0->getReg())) {
          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
          Src0Inlined = true;
        } else if ((Src0->getReg().isPhysical() &&
                    (ST.getConstantBusLimit(Opc) <= 1 &&
                     RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
                   (Src0->getReg().isVirtual() &&
                    (ST.getConstantBusLimit(Opc) <= 1 &&
                     RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
          return false;
        // VGPR is okay as Src0 - fallthrough
      }

      if (Src1->isReg() && !Src0Inlined) {
        // We have one slot for inlinable constant so far - try to fill it
        MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
        if (Def && Def->isMoveImmediate() &&
            isInlineConstant(Def->getOperand(1)) &&
            MRI->hasOneUse(Src1->getReg()) &&
            commuteInstruction(UseMI)) {
          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
        } else if ((Src1->getReg().isPhysical() &&
                    RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
                   (Src1->getReg().isVirtual() &&
                    RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
          return false;
        // VGPR is okay as Src1 - fallthrough
      }

      unsigned NewOpc =
          IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
                : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
      if (pseudoToMCOpcode(NewOpc) == -1)
        return false;

      const int64_t Imm = ImmOp->getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.removeOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.removeOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));
      // UseMI might have been commuted above, in which case Src1 is now an
      // SGPR. Using both an inlined constant and an SGPR is illegal, so
      // legalize the operands again.
      legalizeOperands(UseMI);

      bool DeleteDef = MRI->use_nodbg_empty(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

static bool
memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
                           ArrayRef<const MachineOperand *> BaseOps2) {
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  }
  return true;
}

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ?
WidthA : WidthB; 3162 return LowOffset + LowWidth <= HighOffset; 3163 } 3164 3165 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 3166 const MachineInstr &MIb) const { 3167 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; 3168 int64_t Offset0, Offset1; 3169 unsigned Dummy0, Dummy1; 3170 bool Offset0IsScalable, Offset1IsScalable; 3171 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, 3172 Dummy0, &RI) || 3173 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, 3174 Dummy1, &RI)) 3175 return false; 3176 3177 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) 3178 return false; 3179 3180 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 3181 // FIXME: Handle ds_read2 / ds_write2. 3182 return false; 3183 } 3184 unsigned Width0 = MIa.memoperands().front()->getSize(); 3185 unsigned Width1 = MIb.memoperands().front()->getSize(); 3186 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); 3187 } 3188 3189 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 3190 const MachineInstr &MIb) const { 3191 assert(MIa.mayLoadOrStore() && 3192 "MIa must load from or modify a memory location"); 3193 assert(MIb.mayLoadOrStore() && 3194 "MIb must load from or modify a memory location"); 3195 3196 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 3197 return false; 3198 3199 // XXX - Can we relax this between address spaces? 3200 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 3201 return false; 3202 3203 // TODO: Should we check the address space from the MachineMemOperand? That 3204 // would allow us to distinguish objects we know don't alias based on the 3205 // underlying address space, even if it was lowered to a different one, 3206 // e.g. private accesses lowered to use MUBUF instructions on a scratch 3207 // buffer. 
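  // The checks below only compare offsets when both accesses are in the same
  // class (DS, buffer, or SMRD). FLAT can address any segment, so an access
  // paired with a non-segment-specific FLAT access is conservatively assumed
  // to alias.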
3208 if (isDS(MIa)) { 3209 if (isDS(MIb)) 3210 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3211 3212 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 3213 } 3214 3215 if (isMUBUF(MIa) || isMTBUF(MIa)) { 3216 if (isMUBUF(MIb) || isMTBUF(MIb)) 3217 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3218 3219 return !isFLAT(MIb) && !isSMRD(MIb); 3220 } 3221 3222 if (isSMRD(MIa)) { 3223 if (isSMRD(MIb)) 3224 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3225 3226 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 3227 } 3228 3229 if (isFLAT(MIa)) { 3230 if (isFLAT(MIb)) 3231 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3232 3233 return false; 3234 } 3235 3236 return false; 3237 } 3238 3239 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, 3240 int64_t &Imm, MachineInstr **DefMI = nullptr) { 3241 if (Reg.isPhysical()) 3242 return false; 3243 auto *Def = MRI.getUniqueVRegDef(Reg); 3244 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { 3245 Imm = Def->getOperand(1).getImm(); 3246 if (DefMI) 3247 *DefMI = Def; 3248 return true; 3249 } 3250 return false; 3251 } 3252 3253 static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, 3254 MachineInstr **DefMI = nullptr) { 3255 if (!MO->isReg()) 3256 return false; 3257 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 3258 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3259 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); 3260 } 3261 3262 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, 3263 MachineInstr &NewMI) { 3264 if (LV) { 3265 unsigned NumOps = MI.getNumOperands(); 3266 for (unsigned I = 1; I < NumOps; ++I) { 3267 MachineOperand &Op = MI.getOperand(I); 3268 if (Op.isReg() && Op.isKill()) 3269 LV->replaceKillInstruction(Op.getReg(), MI, NewMI); 3270 } 3271 } 3272 } 3273 3274 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, 3275 LiveVariables *LV, 3276 LiveIntervals *LIS) const { 3277 MachineBasicBlock &MBB = *MI.getParent(); 3278 unsigned Opc = MI.getOpcode(); 3279 3280 // Handle MFMA. 3281 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); 3282 if (NewMFMAOpc != -1) { 3283 MachineInstrBuilder MIB = 3284 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); 3285 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) 3286 MIB.add(MI.getOperand(I)); 3287 updateLiveVariables(LV, MI, *MIB); 3288 if (LIS) 3289 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3290 return MIB; 3291 } 3292 3293 // Handle MAC/FMAC. 
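  // V_MAC/V_FMAC read and write vdst (src2 is tied to it). The rest of this
  // function rewrites them to the untied three-address V_MAD/V_FMA forms, or
  // to the madak/madmk (fmaak/fmamk) forms when one source is a foldable
  // immediate.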
3294 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || 3295 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; 3296 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 3297 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || 3298 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || 3299 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || 3300 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 3301 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 3302 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || 3303 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || 3304 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || 3305 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; 3306 bool Src0Literal = false; 3307 3308 switch (Opc) { 3309 default: 3310 return nullptr; 3311 case AMDGPU::V_MAC_F16_e64: 3312 case AMDGPU::V_FMAC_F16_e64: 3313 case AMDGPU::V_MAC_F32_e64: 3314 case AMDGPU::V_MAC_LEGACY_F32_e64: 3315 case AMDGPU::V_FMAC_F32_e64: 3316 case AMDGPU::V_FMAC_LEGACY_F32_e64: 3317 case AMDGPU::V_FMAC_F64_e64: 3318 break; 3319 case AMDGPU::V_MAC_F16_e32: 3320 case AMDGPU::V_FMAC_F16_e32: 3321 case AMDGPU::V_MAC_F32_e32: 3322 case AMDGPU::V_MAC_LEGACY_F32_e32: 3323 case AMDGPU::V_FMAC_F32_e32: 3324 case AMDGPU::V_FMAC_LEGACY_F32_e32: 3325 case AMDGPU::V_FMAC_F64_e32: { 3326 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3327 AMDGPU::OpName::src0); 3328 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 3329 if (!Src0->isReg() && !Src0->isImm()) 3330 return nullptr; 3331 3332 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 3333 Src0Literal = true; 3334 3335 break; 3336 } 3337 } 3338 3339 MachineInstrBuilder MIB; 3340 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3341 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 3342 const MachineOperand *Src0Mods = 3343 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 3344 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3345 const MachineOperand *Src1Mods = 3346 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 3347 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3348 const MachineOperand *Src2Mods = 3349 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); 3350 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3351 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 3352 3353 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && 3354 !IsLegacy && 3355 // If we have an SGPR input, we will violate the constant bus restriction. 3356 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || 3357 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { 3358 MachineInstr *DefMI; 3359 const auto killDef = [&DefMI, &MBB, this]() -> void { 3360 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3361 // The only user is the instruction which will be killed. 3362 if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) 3363 return; 3364 // We cannot just remove the DefMI here, calling pass will crash. 3365 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); 3366 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) 3367 DefMI->removeOperand(I); 3368 }; 3369 3370 int64_t Imm; 3371 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { 3372 unsigned NewOpc = 3373 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) 3374 : (IsF16 ? 
AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 3375 if (pseudoToMCOpcode(NewOpc) != -1) { 3376 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3377 .add(*Dst) 3378 .add(*Src0) 3379 .add(*Src1) 3380 .addImm(Imm); 3381 updateLiveVariables(LV, MI, *MIB); 3382 if (LIS) 3383 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3384 killDef(); 3385 return MIB; 3386 } 3387 } 3388 unsigned NewOpc = IsFMA 3389 ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) 3390 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 3391 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { 3392 if (pseudoToMCOpcode(NewOpc) != -1) { 3393 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3394 .add(*Dst) 3395 .add(*Src0) 3396 .addImm(Imm) 3397 .add(*Src2); 3398 updateLiveVariables(LV, MI, *MIB); 3399 if (LIS) 3400 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3401 killDef(); 3402 return MIB; 3403 } 3404 } 3405 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { 3406 if (Src0Literal) { 3407 Imm = Src0->getImm(); 3408 DefMI = nullptr; 3409 } 3410 if (pseudoToMCOpcode(NewOpc) != -1 && 3411 isOperandLegal( 3412 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), 3413 Src1)) { 3414 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3415 .add(*Dst) 3416 .add(*Src1) 3417 .addImm(Imm) 3418 .add(*Src2); 3419 updateLiveVariables(LV, MI, *MIB); 3420 if (LIS) 3421 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3422 if (DefMI) 3423 killDef(); 3424 return MIB; 3425 } 3426 } 3427 } 3428 3429 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma 3430 // because VOP3 does not allow a literal operand. 3431 // TODO: Remove this restriction for GFX10. 3432 if (Src0Literal) 3433 return nullptr; 3434 3435 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 3436 : IsF64 ? AMDGPU::V_FMA_F64_e64 3437 : IsLegacy 3438 ? AMDGPU::V_FMA_LEGACY_F32_e64 3439 : AMDGPU::V_FMA_F32_e64 3440 : IsF16 ? AMDGPU::V_MAD_F16_e64 3441 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 3442 : AMDGPU::V_MAD_F32_e64; 3443 if (pseudoToMCOpcode(NewOpc) == -1) 3444 return nullptr; 3445 3446 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3447 .add(*Dst) 3448 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 3449 .add(*Src0) 3450 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 3451 .add(*Src1) 3452 .addImm(Src2Mods ? Src2Mods->getImm() : 0) 3453 .add(*Src2) 3454 .addImm(Clamp ? Clamp->getImm() : 0) 3455 .addImm(Omod ? Omod->getImm() : 0); 3456 updateLiveVariables(LV, MI, *MIB); 3457 if (LIS) 3458 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3459 return MIB; 3460 } 3461 3462 // It's not generally safe to move VALU instructions across these since it will 3463 // start using the register as a base index rather than directly. 3464 // XXX - Why isn't hasSideEffects sufficient for these? 3465 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 3466 switch (MI.getOpcode()) { 3467 case AMDGPU::S_SET_GPR_IDX_ON: 3468 case AMDGPU::S_SET_GPR_IDX_MODE: 3469 case AMDGPU::S_SET_GPR_IDX_OFF: 3470 return true; 3471 default: 3472 return false; 3473 } 3474 } 3475 3476 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 3477 const MachineBasicBlock *MBB, 3478 const MachineFunction &MF) const { 3479 // Skipping the check for SP writes in the base implementation. The reason it 3480 // was added was apparently due to compile time concerns. 3481 // 3482 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops 3483 // but is probably avoidable. 3484 3485 // Copied from base implementation. 
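  // The terminator/label and INLINEASM_BR checks below match
  // TargetInstrInfo::isSchedulingBoundary; the SCHED_BARRIER, EXEC, mode
  // register and GPR indexing checks after them are AMDGPU-specific.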
3486 // Terminators and labels can't be scheduled around. 3487 if (MI.isTerminator() || MI.isPosition()) 3488 return true; 3489 3490 // INLINEASM_BR can jump to another block 3491 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) 3492 return true; 3493 3494 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) 3495 return true; 3496 3497 // Target-independent instructions do not have an implicit-use of EXEC, even 3498 // when they operate on VGPRs. Treating EXEC modifications as scheduling 3499 // boundaries prevents incorrect movements of such instructions. 3500 return MI.modifiesRegister(AMDGPU::EXEC, &RI) || 3501 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 3502 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 3503 changesVGPRIndexingMode(MI); 3504 } 3505 3506 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 3507 return Opcode == AMDGPU::DS_ORDERED_COUNT || 3508 Opcode == AMDGPU::DS_GWS_INIT || 3509 Opcode == AMDGPU::DS_GWS_SEMA_V || 3510 Opcode == AMDGPU::DS_GWS_SEMA_BR || 3511 Opcode == AMDGPU::DS_GWS_SEMA_P || 3512 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 3513 Opcode == AMDGPU::DS_GWS_BARRIER; 3514 } 3515 3516 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { 3517 // Skip the full operand and register alias search modifiesRegister 3518 // does. There's only a handful of instructions that touch this, it's only an 3519 // implicit def, and doesn't alias any other registers. 3520 if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { 3521 for (; ImpDef && *ImpDef; ++ImpDef) { 3522 if (*ImpDef == AMDGPU::MODE) 3523 return true; 3524 } 3525 } 3526 3527 return false; 3528 } 3529 3530 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 3531 unsigned Opcode = MI.getOpcode(); 3532 3533 if (MI.mayStore() && isSMRD(MI)) 3534 return true; // scalar store or atomic 3535 3536 // This will terminate the function when other lanes may need to continue. 3537 if (MI.isReturn()) 3538 return true; 3539 3540 // These instructions cause shader I/O that may cause hardware lockups 3541 // when executed with an empty EXEC mask. 3542 // 3543 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 3544 // EXEC = 0, but checking for that case here seems not worth it 3545 // given the typical code patterns. 3546 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 3547 isEXP(Opcode) || 3548 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 3549 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 3550 return true; 3551 3552 if (MI.isCall() || MI.isInlineAsm()) 3553 return true; // conservative assumption 3554 3555 // A mode change is a scalar operation that influences vector instructions. 3556 if (modifiesModeRegister(MI)) 3557 return true; 3558 3559 // These are like SALU instructions in terms of effects, so it's questionable 3560 // whether we should return true for those. 3561 // 3562 // However, executing them with EXEC = 0 causes them to operate on undefined 3563 // data, which we avoid by returning true here. 3564 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || 3565 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) 3566 return true; 3567 3568 return false; 3569 } 3570 3571 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 3572 const MachineInstr &MI) const { 3573 if (MI.isMetaInstruction()) 3574 return false; 3575 3576 // This won't read exec if this is an SGPR->SGPR copy. 
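  // A copy with a non-SGPR destination is lowered to per-lane V_MOV/accvgpr
  // moves that depend on EXEC, whereas an SGPR-to-SGPR copy becomes S_MOV and
  // only reads EXEC if EXEC appears as an explicit source operand.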
3577   if (MI.isCopyLike()) {
3578     if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
3579       return true;
3580 
3581     // Make sure this isn't copying exec as a normal operand
3582     return MI.readsRegister(AMDGPU::EXEC, &RI);
3583   }
3584 
3585   // Make a conservative assumption about the callee.
3586   if (MI.isCall())
3587     return true;
3588 
3589   // Be conservative with any unhandled generic opcodes.
3590   if (!isTargetSpecificOpcode(MI.getOpcode()))
3591     return true;
3592 
3593   return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
3594 }
3595 
3596 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
3597   switch (Imm.getBitWidth()) {
3598   case 1: // This likely will be a condition code mask.
3599     return true;
3600 
3601   case 32:
3602     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
3603                                         ST.hasInv2PiInlineImm());
3604   case 64:
3605     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
3606                                         ST.hasInv2PiInlineImm());
3607   case 16:
3608     return ST.has16BitInsts() &&
3609            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
3610                                         ST.hasInv2PiInlineImm());
3611   default:
3612     llvm_unreachable("invalid bitwidth");
3613   }
3614 }
3615 
3616 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
3617                                    uint8_t OperandType) const {
3618   if (!MO.isImm() ||
3619       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
3620       OperandType > AMDGPU::OPERAND_SRC_LAST)
3621     return false;
3622 
3623   // MachineOperand provides no way to tell the true operand size, since it only
3624   // records a 64-bit value. We need to know the size to determine if a 32-bit
3625   // floating point immediate bit pattern is legal for an integer immediate. It
3626   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
3627 
3628   int64_t Imm = MO.getImm();
3629   switch (OperandType) {
3630   case AMDGPU::OPERAND_REG_IMM_INT32:
3631   case AMDGPU::OPERAND_REG_IMM_FP32:
3632   case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
3633   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3634   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3635   case AMDGPU::OPERAND_REG_IMM_V2FP32:
3636   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
3637   case AMDGPU::OPERAND_REG_IMM_V2INT32:
3638   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
3639   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3640   case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
3641     int32_t Trunc = static_cast<int32_t>(Imm);
3642     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
3643   }
3644   case AMDGPU::OPERAND_REG_IMM_INT64:
3645   case AMDGPU::OPERAND_REG_IMM_FP64:
3646   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3647   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3648   case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
3649     return AMDGPU::isInlinableLiteral64(MO.getImm(),
3650                                         ST.hasInv2PiInlineImm());
3651   case AMDGPU::OPERAND_REG_IMM_INT16:
3652   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3653   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
3654     // We would expect inline immediates to not be concerned with an integer/fp
3655     // distinction. However, in the case of 16-bit integer operations, the
3656     // "floating point" values appear to not work. It seems to read the low
3657     // 16 bits of 32-bit immediates, which happens to always work for the
3658     // integer values.
3659     //
3660     // See llvm bugzilla 46302.
3661     //
3662     // TODO: Theoretically we could use op-sel to use the high bits of the
3663     // 32-bit FP values.
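    // e.g. 64 is accepted as an inline constant here, but 0x3800 (the fp16
    // encoding of 0.5) is not, even though 0.5 is inlinable for the true
    // 16-bit FP operand types handled further down.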
3664 return AMDGPU::isInlinableIntLiteral(Imm); 3665 case AMDGPU::OPERAND_REG_IMM_V2INT16: 3666 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 3667 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: 3668 // This suffers the same problem as the scalar 16-bit cases. 3669 return AMDGPU::isInlinableIntLiteralV216(Imm); 3670 case AMDGPU::OPERAND_REG_IMM_FP16: 3671 case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: 3672 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3673 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3674 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 3675 // A few special case instructions have 16-bit operands on subtargets 3676 // where 16-bit instructions are not legal. 3677 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle 3678 // constants in these cases 3679 int16_t Trunc = static_cast<int16_t>(Imm); 3680 return ST.has16BitInsts() && 3681 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 3682 } 3683 3684 return false; 3685 } 3686 case AMDGPU::OPERAND_REG_IMM_V2FP16: 3687 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 3688 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 3689 uint32_t Trunc = static_cast<uint32_t>(Imm); 3690 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 3691 } 3692 case AMDGPU::OPERAND_KIMM32: 3693 case AMDGPU::OPERAND_KIMM16: 3694 return false; 3695 default: 3696 llvm_unreachable("invalid bitwidth"); 3697 } 3698 } 3699 3700 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 3701 const MCOperandInfo &OpInfo) const { 3702 switch (MO.getType()) { 3703 case MachineOperand::MO_Register: 3704 return false; 3705 case MachineOperand::MO_Immediate: 3706 return !isInlineConstant(MO, OpInfo); 3707 case MachineOperand::MO_FrameIndex: 3708 case MachineOperand::MO_MachineBasicBlock: 3709 case MachineOperand::MO_ExternalSymbol: 3710 case MachineOperand::MO_GlobalAddress: 3711 case MachineOperand::MO_MCSymbol: 3712 return true; 3713 default: 3714 llvm_unreachable("unexpected operand type"); 3715 } 3716 } 3717 3718 static bool compareMachineOp(const MachineOperand &Op0, 3719 const MachineOperand &Op1) { 3720 if (Op0.getType() != Op1.getType()) 3721 return false; 3722 3723 switch (Op0.getType()) { 3724 case MachineOperand::MO_Register: 3725 return Op0.getReg() == Op1.getReg(); 3726 case MachineOperand::MO_Immediate: 3727 return Op0.getImm() == Op1.getImm(); 3728 default: 3729 llvm_unreachable("Didn't expect to be comparing these operand types"); 3730 } 3731 } 3732 3733 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 3734 const MachineOperand &MO) const { 3735 const MCInstrDesc &InstDesc = MI.getDesc(); 3736 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; 3737 3738 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3739 3740 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3741 return true; 3742 3743 if (OpInfo.RegClass < 0) 3744 return false; 3745 3746 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3747 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3748 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3749 AMDGPU::OpName::src2)) 3750 return false; 3751 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3752 } 3753 3754 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3755 return false; 3756 3757 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3758 return true; 3759 3760 return ST.hasVOP3Literal(); 3761 } 3762 3763 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3764 // GFX90A does not have V_MUL_LEGACY_F32_e32. 
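  // (Its VOP3 form is the only encoding on that subtarget, so report that no
  // 32-bit encoding exists rather than shrinking to an instruction GFX90A
  // cannot encode.)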
3765 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) 3766 return false; 3767 3768 int Op32 = AMDGPU::getVOPe32(Opcode); 3769 if (Op32 == -1) 3770 return false; 3771 3772 return pseudoToMCOpcode(Op32) != -1; 3773 } 3774 3775 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3776 // The src0_modifier operand is present on all instructions 3777 // that have modifiers. 3778 3779 return AMDGPU::getNamedOperandIdx(Opcode, 3780 AMDGPU::OpName::src0_modifiers) != -1; 3781 } 3782 3783 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 3784 unsigned OpName) const { 3785 const MachineOperand *Mods = getNamedOperand(MI, OpName); 3786 return Mods && Mods->getImm(); 3787 } 3788 3789 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 3790 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 3791 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 3792 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 3793 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 3794 hasModifiersSet(MI, AMDGPU::OpName::omod); 3795 } 3796 3797 bool SIInstrInfo::canShrink(const MachineInstr &MI, 3798 const MachineRegisterInfo &MRI) const { 3799 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3800 // Can't shrink instruction with three operands. 3801 if (Src2) { 3802 switch (MI.getOpcode()) { 3803 default: return false; 3804 3805 case AMDGPU::V_ADDC_U32_e64: 3806 case AMDGPU::V_SUBB_U32_e64: 3807 case AMDGPU::V_SUBBREV_U32_e64: { 3808 const MachineOperand *Src1 3809 = getNamedOperand(MI, AMDGPU::OpName::src1); 3810 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 3811 return false; 3812 // Additional verification is needed for sdst/src2. 3813 return true; 3814 } 3815 case AMDGPU::V_MAC_F16_e64: 3816 case AMDGPU::V_MAC_F32_e64: 3817 case AMDGPU::V_MAC_LEGACY_F32_e64: 3818 case AMDGPU::V_FMAC_F16_e64: 3819 case AMDGPU::V_FMAC_F32_e64: 3820 case AMDGPU::V_FMAC_F64_e64: 3821 case AMDGPU::V_FMAC_LEGACY_F32_e64: 3822 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 3823 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 3824 return false; 3825 break; 3826 3827 case AMDGPU::V_CNDMASK_B32_e64: 3828 break; 3829 } 3830 } 3831 3832 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3833 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 3834 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 3835 return false; 3836 3837 // We don't need to check src0, all input types are legal, so just make sure 3838 // src0 isn't using any modifiers. 3839 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 3840 return false; 3841 3842 // Can it be shrunk to a valid 32 bit opcode? 3843 if (!hasVALU32BitEncoding(MI.getOpcode())) 3844 return false; 3845 3846 // Check output modifiers 3847 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 3848 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 3849 } 3850 3851 // Set VCC operand with all flags from \p Orig, except for setting it as 3852 // implicit. 
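// This is used when shrinking to the e32 form, e.g. for V_CNDMASK_B32: the
// explicit VCC source of the e64 instruction becomes an implicit vcc/vcc_lo
// use on the shrunk instruction, and its kill/undef flags must be preserved.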
3853 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3854 const MachineOperand &Orig) { 3855 3856 for (MachineOperand &Use : MI.implicit_operands()) { 3857 if (Use.isUse() && 3858 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { 3859 Use.setIsUndef(Orig.isUndef()); 3860 Use.setIsKill(Orig.isKill()); 3861 return; 3862 } 3863 } 3864 } 3865 3866 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3867 unsigned Op32) const { 3868 MachineBasicBlock *MBB = MI.getParent(); 3869 MachineInstrBuilder Inst32 = 3870 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 3871 .setMIFlags(MI.getFlags()); 3872 3873 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3874 // For VOPC instructions, this is replaced by an implicit def of vcc. 3875 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) { 3876 // dst 3877 Inst32.add(MI.getOperand(0)); 3878 } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) { 3879 // VOPCX instructions won't be writing to an explicit dst, so this should 3880 // not fail for these instructions. 3881 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 3882 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 3883 "Unexpected case"); 3884 } 3885 3886 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 3887 3888 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3889 if (Src1) 3890 Inst32.add(*Src1); 3891 3892 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3893 3894 if (Src2) { 3895 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 3896 if (Op32Src2Idx != -1) { 3897 Inst32.add(*Src2); 3898 } else { 3899 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 3900 // replaced with an implicit read of vcc or vcc_lo. The implicit read 3901 // of vcc was already added during the initial BuildMI, but we 3902 // 1) may need to change vcc to vcc_lo to preserve the original register 3903 // 2) have to preserve the original flags. 3904 fixImplicitOperands(*Inst32); 3905 copyFlagsToImplicitVCC(*Inst32, *Src2); 3906 } 3907 } 3908 3909 return Inst32; 3910 } 3911 3912 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 3913 const MachineOperand &MO, 3914 const MCOperandInfo &OpInfo) const { 3915 // Literal constants use the constant bus. 3916 //if (isLiteralConstantLike(MO, OpInfo)) 3917 // return true; 3918 if (MO.isImm()) 3919 return !isInlineConstant(MO, OpInfo); 3920 3921 if (!MO.isReg()) 3922 return true; // Misc other operands like FrameIndex 3923 3924 if (!MO.isUse()) 3925 return false; 3926 3927 if (MO.getReg().isVirtual()) 3928 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 3929 3930 // Null is free 3931 if (MO.getReg() == AMDGPU::SGPR_NULL) 3932 return false; 3933 3934 // SGPRs use the constant bus 3935 if (MO.isImplicit()) { 3936 return MO.getReg() == AMDGPU::M0 || 3937 MO.getReg() == AMDGPU::VCC || 3938 MO.getReg() == AMDGPU::VCC_LO; 3939 } else { 3940 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 3941 AMDGPU::SReg_64RegClass.contains(MO.getReg()); 3942 } 3943 } 3944 3945 static Register findImplicitSGPRRead(const MachineInstr &MI) { 3946 for (const MachineOperand &MO : MI.implicit_operands()) { 3947 // We only care about reads. 
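    // Only VCC, M0 and FLAT_SCR are of interest: these are the implicit
    // scalar reads that callers count against the constant bus or need to
    // legalize.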
3948 if (MO.isDef()) 3949 continue; 3950 3951 switch (MO.getReg()) { 3952 case AMDGPU::VCC: 3953 case AMDGPU::VCC_LO: 3954 case AMDGPU::VCC_HI: 3955 case AMDGPU::M0: 3956 case AMDGPU::FLAT_SCR: 3957 return MO.getReg(); 3958 3959 default: 3960 break; 3961 } 3962 } 3963 3964 return AMDGPU::NoRegister; 3965 } 3966 3967 static bool shouldReadExec(const MachineInstr &MI) { 3968 if (SIInstrInfo::isVALU(MI)) { 3969 switch (MI.getOpcode()) { 3970 case AMDGPU::V_READLANE_B32: 3971 case AMDGPU::V_WRITELANE_B32: 3972 return false; 3973 } 3974 3975 return true; 3976 } 3977 3978 if (MI.isPreISelOpcode() || 3979 SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 3980 SIInstrInfo::isSALU(MI) || 3981 SIInstrInfo::isSMRD(MI)) 3982 return false; 3983 3984 return true; 3985 } 3986 3987 static bool isSubRegOf(const SIRegisterInfo &TRI, 3988 const MachineOperand &SuperVec, 3989 const MachineOperand &SubReg) { 3990 if (SubReg.getReg().isPhysical()) 3991 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 3992 3993 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 3994 SubReg.getReg() == SuperVec.getReg(); 3995 } 3996 3997 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 3998 StringRef &ErrInfo) const { 3999 uint16_t Opcode = MI.getOpcode(); 4000 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 4001 return true; 4002 4003 const MachineFunction *MF = MI.getParent()->getParent(); 4004 const MachineRegisterInfo &MRI = MF->getRegInfo(); 4005 4006 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 4007 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 4008 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 4009 4010 // Make sure the number of operands is correct. 4011 const MCInstrDesc &Desc = get(Opcode); 4012 if (!Desc.isVariadic() && 4013 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 4014 ErrInfo = "Instruction has wrong number of operands."; 4015 return false; 4016 } 4017 4018 if (MI.isInlineAsm()) { 4019 // Verify register classes for inlineasm constraints. 4020 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 4021 I != E; ++I) { 4022 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 4023 if (!RC) 4024 continue; 4025 4026 const MachineOperand &Op = MI.getOperand(I); 4027 if (!Op.isReg()) 4028 continue; 4029 4030 Register Reg = Op.getReg(); 4031 if (!Reg.isVirtual() && !RC->contains(Reg)) { 4032 ErrInfo = "inlineasm operand has incorrect register class."; 4033 return false; 4034 } 4035 } 4036 4037 return true; 4038 } 4039 4040 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 4041 ErrInfo = "missing memory operand from MIMG instruction."; 4042 return false; 4043 } 4044 4045 // Make sure the register classes are correct. 4046 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 4047 const MachineOperand &MO = MI.getOperand(i); 4048 if (MO.isFPImm()) { 4049 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 4050 "all fp values to integers."; 4051 return false; 4052 } 4053 4054 int RegClass = Desc.OpInfo[i].RegClass; 4055 4056 switch (Desc.OpInfo[i].OperandType) { 4057 case MCOI::OPERAND_REGISTER: 4058 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 4059 ErrInfo = "Illegal immediate value for operand."; 4060 return false; 4061 } 4062 break; 4063 case AMDGPU::OPERAND_REG_IMM_INT32: 4064 case AMDGPU::OPERAND_REG_IMM_FP32: 4065 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: 4066 case AMDGPU::OPERAND_REG_IMM_V2FP32: 4067 break; 4068 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 4069 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 4070 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 4071 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 4072 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 4073 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 4074 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 4075 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 4076 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 4077 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: 4078 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { 4079 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 4080 ErrInfo = "Illegal immediate value for operand."; 4081 return false; 4082 } 4083 break; 4084 } 4085 case MCOI::OPERAND_IMMEDIATE: 4086 case AMDGPU::OPERAND_KIMM32: 4087 // Check if this operand is an immediate. 4088 // FrameIndex operands will be replaced by immediates, so they are 4089 // allowed. 4090 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 4091 ErrInfo = "Expected immediate, but got non-immediate"; 4092 return false; 4093 } 4094 LLVM_FALLTHROUGH; 4095 default: 4096 continue; 4097 } 4098 4099 if (!MO.isReg()) 4100 continue; 4101 Register Reg = MO.getReg(); 4102 if (!Reg) 4103 continue; 4104 4105 // FIXME: Ideally we would have separate instruction definitions with the 4106 // aligned register constraint. 4107 // FIXME: We do not verify inline asm operands, but custom inline asm 4108 // verification is broken anyway 4109 if (ST.needsAlignedVGPRs()) { 4110 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); 4111 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { 4112 const TargetRegisterClass *SubRC = 4113 RI.getSubRegClass(RC, MO.getSubReg()); 4114 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); 4115 if (RC) 4116 RC = SubRC; 4117 } 4118 4119 // Check that this is the aligned version of the class. 
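      // e.g. on gfx90a a 64-bit VGPR operand must use an even-aligned pair
      // such as v[2:3]; VReg_64_Align2 is the aligned variant of VReg_64.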
4120 if (!RC || !RI.isProperlyAlignedRC(*RC)) { 4121 ErrInfo = "Subtarget requires even aligned vector registers"; 4122 return false; 4123 } 4124 } 4125 4126 if (RegClass != -1) { 4127 if (Reg.isVirtual()) 4128 continue; 4129 4130 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 4131 if (!RC->contains(Reg)) { 4132 ErrInfo = "Operand has incorrect register class."; 4133 return false; 4134 } 4135 } 4136 } 4137 4138 // Verify SDWA 4139 if (isSDWA(MI)) { 4140 if (!ST.hasSDWA()) { 4141 ErrInfo = "SDWA is not supported on this target"; 4142 return false; 4143 } 4144 4145 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 4146 4147 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) { 4148 if (OpIdx == -1) 4149 continue; 4150 const MachineOperand &MO = MI.getOperand(OpIdx); 4151 4152 if (!ST.hasSDWAScalar()) { 4153 // Only VGPRS on VI 4154 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 4155 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 4156 return false; 4157 } 4158 } else { 4159 // No immediates on GFX9 4160 if (!MO.isReg()) { 4161 ErrInfo = 4162 "Only reg allowed as operands in SDWA instructions on GFX9+"; 4163 return false; 4164 } 4165 } 4166 } 4167 4168 if (!ST.hasSDWAOmod()) { 4169 // No omod allowed on VI 4170 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 4171 if (OMod != nullptr && 4172 (!OMod->isImm() || OMod->getImm() != 0)) { 4173 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 4174 return false; 4175 } 4176 } 4177 4178 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 4179 if (isVOPC(BasicOpcode)) { 4180 if (!ST.hasSDWASdst() && DstIdx != -1) { 4181 // Only vcc allowed as dst on VI for VOPC 4182 const MachineOperand &Dst = MI.getOperand(DstIdx); 4183 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 4184 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 4185 return false; 4186 } 4187 } else if (!ST.hasSDWAOutModsVOPC()) { 4188 // No clamp allowed on GFX9 for VOPC 4189 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 4190 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 4191 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 4192 return false; 4193 } 4194 4195 // No omod allowed on GFX9 for VOPC 4196 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 4197 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 4198 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 4199 return false; 4200 } 4201 } 4202 } 4203 4204 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 4205 if (DstUnused && DstUnused->isImm() && 4206 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 4207 const MachineOperand &Dst = MI.getOperand(DstIdx); 4208 if (!Dst.isReg() || !Dst.isTied()) { 4209 ErrInfo = "Dst register should have tied register"; 4210 return false; 4211 } 4212 4213 const MachineOperand &TiedMO = 4214 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 4215 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 4216 ErrInfo = 4217 "Dst register should be tied to implicit use of preserved register"; 4218 return false; 4219 } else if (TiedMO.getReg().isPhysical() && 4220 Dst.getReg() != TiedMO.getReg()) { 4221 ErrInfo = "Dst register should use same physical register as preserved"; 4222 return false; 4223 } 4224 } 4225 } 4226 4227 // Verify MIMG 4228 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 4229 // Ensure that the return type used is large 
enough for all the options 4230 // being used TFE/LWE require an extra result register. 4231 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 4232 if (DMask) { 4233 uint64_t DMaskImm = DMask->getImm(); 4234 uint32_t RegCount = 4235 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 4236 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 4237 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 4238 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 4239 4240 // Adjust for packed 16 bit values 4241 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 4242 RegCount >>= 1; 4243 4244 // Adjust if using LWE or TFE 4245 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 4246 RegCount += 1; 4247 4248 const uint32_t DstIdx = 4249 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 4250 const MachineOperand &Dst = MI.getOperand(DstIdx); 4251 if (Dst.isReg()) { 4252 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 4253 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 4254 if (RegCount > DstSize) { 4255 ErrInfo = "MIMG instruction returns too many registers for dst " 4256 "register class"; 4257 return false; 4258 } 4259 } 4260 } 4261 } 4262 4263 // Verify VOP*. Ignore multiple sgpr operands on writelane. 4264 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) { 4265 unsigned ConstantBusCount = 0; 4266 bool UsesLiteral = false; 4267 const MachineOperand *LiteralVal = nullptr; 4268 4269 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 4270 ++ConstantBusCount; 4271 4272 SmallVector<Register, 2> SGPRsUsed; 4273 Register SGPRUsed; 4274 4275 // Only look at the true operands. Only a real operand can use the constant 4276 // bus, and we don't want to check pseudo-operands like the source modifier 4277 // flags. 
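    // Each distinct SGPR and the first literal occupy one constant bus slot,
    // so the same SGPR read twice costs one slot while two different SGPRs
    // cost two.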
4278 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { 4279 if (OpIdx == -1) 4280 break; 4281 const MachineOperand &MO = MI.getOperand(OpIdx); 4282 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 4283 if (MO.isReg()) { 4284 SGPRUsed = MO.getReg(); 4285 if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) { 4286 return SGPRUsed != SGPR; 4287 })) { 4288 ++ConstantBusCount; 4289 SGPRsUsed.push_back(SGPRUsed); 4290 } 4291 } else { 4292 if (!UsesLiteral) { 4293 ++ConstantBusCount; 4294 UsesLiteral = true; 4295 LiteralVal = &MO; 4296 } else if (!MO.isIdenticalTo(*LiteralVal)) { 4297 assert(isVOP3(MI)); 4298 ErrInfo = "VOP3 instruction uses more than one literal"; 4299 return false; 4300 } 4301 } 4302 } 4303 } 4304 4305 SGPRUsed = findImplicitSGPRRead(MI); 4306 if (SGPRUsed != AMDGPU::NoRegister) { 4307 // Implicit uses may safely overlap true operands 4308 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 4309 return !RI.regsOverlap(SGPRUsed, SGPR); 4310 })) { 4311 ++ConstantBusCount; 4312 SGPRsUsed.push_back(SGPRUsed); 4313 } 4314 } 4315 4316 // v_writelane_b32 is an exception from constant bus restriction: 4317 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 4318 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 4319 Opcode != AMDGPU::V_WRITELANE_B32) { 4320 ErrInfo = "VOP* instruction violates constant bus restriction"; 4321 return false; 4322 } 4323 4324 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { 4325 ErrInfo = "VOP3 instruction uses literal"; 4326 return false; 4327 } 4328 } 4329 4330 // Special case for writelane - this can break the multiple constant bus rule, 4331 // but still can't use more than one SGPR register 4332 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 4333 unsigned SGPRCount = 0; 4334 Register SGPRUsed = AMDGPU::NoRegister; 4335 4336 for (int OpIdx : {Src0Idx, Src1Idx}) { 4337 if (OpIdx == -1) 4338 break; 4339 4340 const MachineOperand &MO = MI.getOperand(OpIdx); 4341 4342 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 4343 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 4344 if (MO.getReg() != SGPRUsed) 4345 ++SGPRCount; 4346 SGPRUsed = MO.getReg(); 4347 } 4348 } 4349 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 4350 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 4351 return false; 4352 } 4353 } 4354 } 4355 4356 // Verify misc. restrictions on specific instructions. 
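  // V_DIV_SCALE scales whichever of the numerator (src2) or denominator
  // (src1) is repeated in src0, so a well-formed instance must repeat one of
  // them as src0.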
4357 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || 4358 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { 4359 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4360 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 4361 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 4362 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 4363 if (!compareMachineOp(Src0, Src1) && 4364 !compareMachineOp(Src0, Src2)) { 4365 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 4366 return false; 4367 } 4368 } 4369 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & 4370 SISrcMods::ABS) || 4371 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & 4372 SISrcMods::ABS) || 4373 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & 4374 SISrcMods::ABS)) { 4375 ErrInfo = "ABS not allowed in VOP3B instructions"; 4376 return false; 4377 } 4378 } 4379 4380 if (isSOP2(MI) || isSOPC(MI)) { 4381 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4382 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 4383 unsigned Immediates = 0; 4384 4385 if (!Src0.isReg() && 4386 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) 4387 Immediates++; 4388 if (!Src1.isReg() && 4389 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) 4390 Immediates++; 4391 4392 if (Immediates > 1) { 4393 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 4394 return false; 4395 } 4396 } 4397 4398 if (isSOPK(MI)) { 4399 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 4400 if (Desc.isBranch()) { 4401 if (!Op->isMBB()) { 4402 ErrInfo = "invalid branch target for SOPK instruction"; 4403 return false; 4404 } 4405 } else { 4406 uint64_t Imm = Op->getImm(); 4407 if (sopkIsZext(MI)) { 4408 if (!isUInt<16>(Imm)) { 4409 ErrInfo = "invalid immediate for SOPK instruction"; 4410 return false; 4411 } 4412 } else { 4413 if (!isInt<16>(Imm)) { 4414 ErrInfo = "invalid immediate for SOPK instruction"; 4415 return false; 4416 } 4417 } 4418 } 4419 } 4420 4421 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 4422 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 4423 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 4424 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 4425 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 4426 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 4427 4428 const unsigned StaticNumOps = Desc.getNumOperands() + 4429 Desc.getNumImplicitUses(); 4430 const unsigned NumImplicitOps = IsDst ? 2 : 1; 4431 4432 // Allow additional implicit operands. This allows a fixup done by the post 4433 // RA scheduler where the main implicit operand is killed and implicit-defs 4434 // are added for sub-registers that remain live after this instruction. 
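    // V_MOVRELS carries one dynamically added implicit use (the source
    // vector); V_MOVRELD additionally carries an implicit def of that vector
    // tied to the use, which is why NumImplicitOps is 2 for the *RELD forms.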
4435 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 4436 ErrInfo = "missing implicit register operands"; 4437 return false; 4438 } 4439 4440 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 4441 if (IsDst) { 4442 if (!Dst->isUse()) { 4443 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 4444 return false; 4445 } 4446 4447 unsigned UseOpIdx; 4448 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 4449 UseOpIdx != StaticNumOps + 1) { 4450 ErrInfo = "movrel implicit operands should be tied"; 4451 return false; 4452 } 4453 } 4454 4455 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4456 const MachineOperand &ImpUse 4457 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 4458 if (!ImpUse.isReg() || !ImpUse.isUse() || 4459 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 4460 ErrInfo = "src0 should be subreg of implicit vector use"; 4461 return false; 4462 } 4463 } 4464 4465 // Make sure we aren't losing exec uses in the td files. This mostly requires 4466 // being careful when using let Uses to try to add other use registers. 4467 if (shouldReadExec(MI)) { 4468 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 4469 ErrInfo = "VALU instruction does not implicitly read exec mask"; 4470 return false; 4471 } 4472 } 4473 4474 if (isSMRD(MI)) { 4475 if (MI.mayStore()) { 4476 // The register offset form of scalar stores may only use m0 as the 4477 // soffset register. 4478 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset); 4479 if (Soff && Soff->getReg() != AMDGPU::M0) { 4480 ErrInfo = "scalar stores must use m0 as offset register"; 4481 return false; 4482 } 4483 } 4484 } 4485 4486 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { 4487 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 4488 if (Offset->getImm() != 0) { 4489 ErrInfo = "subtarget does not support offsets in flat instructions"; 4490 return false; 4491 } 4492 } 4493 4494 if (isMIMG(MI)) { 4495 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 4496 if (DimOp) { 4497 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 4498 AMDGPU::OpName::vaddr0); 4499 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 4500 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 4501 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4502 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 4503 const AMDGPU::MIMGDimInfo *Dim = 4504 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 4505 4506 if (!Dim) { 4507 ErrInfo = "dim is out of range"; 4508 return false; 4509 } 4510 4511 bool IsA16 = false; 4512 if (ST.hasR128A16()) { 4513 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 4514 IsA16 = R128A16->getImm() != 0; 4515 } else if (ST.hasGFX10A16()) { 4516 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 4517 IsA16 = A16->getImm() != 0; 4518 } 4519 4520 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 4521 4522 unsigned AddrWords = 4523 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); 4524 4525 unsigned VAddrWords; 4526 if (IsNSA) { 4527 VAddrWords = SRsrcIdx - VAddr0Idx; 4528 } else { 4529 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); 4530 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; 4531 if (AddrWords > 8) 4532 AddrWords = 16; 4533 } 4534 4535 if (VAddrWords != AddrWords) { 4536 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords 4537 << " but got " << VAddrWords << "\n"); 4538 ErrInfo = "bad vaddr size"; 4539 return 
false; 4540 } 4541 } 4542 } 4543 4544 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 4545 if (DppCt) { 4546 using namespace AMDGPU::DPP; 4547 4548 unsigned DC = DppCt->getImm(); 4549 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 4550 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 4551 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 4552 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 4553 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 4554 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 4555 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 4556 ErrInfo = "Invalid dpp_ctrl value"; 4557 return false; 4558 } 4559 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && 4560 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4561 ErrInfo = "Invalid dpp_ctrl value: " 4562 "wavefront shifts are not supported on GFX10+"; 4563 return false; 4564 } 4565 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && 4566 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4567 ErrInfo = "Invalid dpp_ctrl value: " 4568 "broadcasts are not supported on GFX10+"; 4569 return false; 4570 } 4571 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && 4572 ST.getGeneration() < AMDGPUSubtarget::GFX10) { 4573 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && 4574 DC <= DppCtrl::ROW_NEWBCAST_LAST && 4575 !ST.hasGFX90AInsts()) { 4576 ErrInfo = "Invalid dpp_ctrl value: " 4577 "row_newbroadcast/row_share is not supported before " 4578 "GFX90A/GFX10"; 4579 return false; 4580 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { 4581 ErrInfo = "Invalid dpp_ctrl value: " 4582 "row_share and row_xmask are not supported before GFX10"; 4583 return false; 4584 } 4585 } 4586 4587 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 4588 4589 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && 4590 ((DstIdx >= 0 && 4591 (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || 4592 Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || 4593 ((Src0Idx >= 0 && 4594 (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || 4595 Desc.OpInfo[Src0Idx].RegClass == 4596 AMDGPU::VReg_64_Align2RegClassID)))) && 4597 !AMDGPU::isLegal64BitDPPControl(DC)) { 4598 ErrInfo = "Invalid dpp_ctrl value: " 4599 "64 bit dpp only support row_newbcast"; 4600 return false; 4601 } 4602 } 4603 4604 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { 4605 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 4606 uint16_t DataNameIdx = isDS(Opcode) ? 
AMDGPU::OpName::data0 4607 : AMDGPU::OpName::vdata; 4608 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); 4609 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); 4610 if (Data && !Data->isReg()) 4611 Data = nullptr; 4612 4613 if (ST.hasGFX90AInsts()) { 4614 if (Dst && Data && 4615 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { 4616 ErrInfo = "Invalid register class: " 4617 "vdata and vdst should be both VGPR or AGPR"; 4618 return false; 4619 } 4620 if (Data && Data2 && 4621 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { 4622 ErrInfo = "Invalid register class: " 4623 "both data operands should be VGPR or AGPR"; 4624 return false; 4625 } 4626 } else { 4627 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || 4628 (Data && RI.isAGPR(MRI, Data->getReg())) || 4629 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { 4630 ErrInfo = "Invalid register class: " 4631 "agpr loads and stores not supported on this GPU"; 4632 return false; 4633 } 4634 } 4635 } 4636 4637 if (ST.needsAlignedVGPRs() && 4638 (MI.getOpcode() == AMDGPU::DS_GWS_INIT || 4639 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || 4640 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { 4641 const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); 4642 Register Reg = Op->getReg(); 4643 bool Aligned = true; 4644 if (Reg.isPhysical()) { 4645 Aligned = !(RI.getHWRegIndex(Reg) & 1); 4646 } else { 4647 const TargetRegisterClass &RC = *MRI.getRegClass(Reg); 4648 Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && 4649 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); 4650 } 4651 4652 if (!Aligned) { 4653 ErrInfo = "Subtarget requires even aligned vector registers " 4654 "for DS_GWS instructions"; 4655 return false; 4656 } 4657 } 4658 4659 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && 4660 !ST.hasGFX90AInsts()) { 4661 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0); 4662 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) { 4663 ErrInfo = "Invalid register class: " 4664 "v_accvgpr_write with an SGPR is not supported on this GPU"; 4665 return false; 4666 } 4667 } 4668 4669 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { 4670 const MachineOperand &SrcOp = MI.getOperand(1); 4671 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { 4672 ErrInfo = "pseudo expects only physical SGPRs"; 4673 return false; 4674 } 4675 } 4676 4677 return true; 4678 } 4679 4680 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { 4681 switch (MI.getOpcode()) { 4682 default: return AMDGPU::INSTRUCTION_LIST_END; 4683 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 4684 case AMDGPU::COPY: return AMDGPU::COPY; 4685 case AMDGPU::PHI: return AMDGPU::PHI; 4686 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 4687 case AMDGPU::WQM: return AMDGPU::WQM; 4688 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; 4689 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; 4690 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; 4691 case AMDGPU::S_MOV_B32: { 4692 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4693 return MI.getOperand(1).isReg() || 4694 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? 4695 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 4696 } 4697 case AMDGPU::S_ADD_I32: 4698 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; 4699 case AMDGPU::S_ADDC_U32: 4700 return AMDGPU::V_ADDC_U32_e32; 4701 case AMDGPU::S_SUB_I32: 4702 return ST.hasAddNoCarry() ? 
AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; 4703 // FIXME: These are not consistently handled, and selected when the carry is 4704 // used. 4705 case AMDGPU::S_ADD_U32: 4706 return AMDGPU::V_ADD_CO_U32_e32; 4707 case AMDGPU::S_SUB_U32: 4708 return AMDGPU::V_SUB_CO_U32_e32; 4709 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 4710 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; 4711 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; 4712 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; 4713 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 4714 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 4715 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 4716 case AMDGPU::S_XNOR_B32: 4717 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 4718 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 4719 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 4720 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 4721 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 4722 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 4723 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; 4724 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 4725 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; 4726 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 4727 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; 4728 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; 4729 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; 4730 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; 4731 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; 4732 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 4733 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 4734 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 4735 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 4736 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; 4737 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; 4738 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; 4739 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; 4740 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; 4741 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; 4742 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; 4743 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; 4744 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; 4745 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; 4746 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; 4747 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; 4748 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; 4749 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; 4750 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 4751 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 4752 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 4753 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 4754 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 4755 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 4756 } 4757 llvm_unreachable( 4758 "Unexpected scalar opcode without corresponding vector one!"); 4759 } 4760 4761 static const TargetRegisterClass * 4762 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, 4763 const MachineRegisterInfo &MRI, 4764 const MCInstrDesc &TID, unsigned RCID, 4765 bool IsAllocatable) { 
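  // Memory and DS/MIMG operands declared with the combined AV_* (VGPR or
  // AGPR) classes are narrowed to the plain VReg_* classes here; the combined
  // class is only kept on gfx90a once the reserved registers are frozen and
  // the operand does not need to be allocatable.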
4766 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 4767 (((TID.mayLoad() || TID.mayStore()) && 4768 !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || 4769 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { 4770 switch (RCID) { 4771 case AMDGPU::AV_32RegClassID: 4772 RCID = AMDGPU::VGPR_32RegClassID; 4773 break; 4774 case AMDGPU::AV_64RegClassID: 4775 RCID = AMDGPU::VReg_64RegClassID; 4776 break; 4777 case AMDGPU::AV_96RegClassID: 4778 RCID = AMDGPU::VReg_96RegClassID; 4779 break; 4780 case AMDGPU::AV_128RegClassID: 4781 RCID = AMDGPU::VReg_128RegClassID; 4782 break; 4783 case AMDGPU::AV_160RegClassID: 4784 RCID = AMDGPU::VReg_160RegClassID; 4785 break; 4786 case AMDGPU::AV_512RegClassID: 4787 RCID = AMDGPU::VReg_512RegClassID; 4788 break; 4789 default: 4790 break; 4791 } 4792 } 4793 4794 return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); 4795 } 4796 4797 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, 4798 unsigned OpNum, const TargetRegisterInfo *TRI, 4799 const MachineFunction &MF) 4800 const { 4801 if (OpNum >= TID.getNumOperands()) 4802 return nullptr; 4803 auto RegClass = TID.OpInfo[OpNum].RegClass; 4804 bool IsAllocatable = false; 4805 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { 4806 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions 4807 // with two data operands. Request register class constrained to VGPR only 4808 // of both operands present as Machine Copy Propagation can not check this 4809 // constraint and possibly other passes too. 4810 // 4811 // The check is limited to FLAT and DS because atomics in non-flat encoding 4812 // have their vdst and vdata tied to be the same register. 4813 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, 4814 AMDGPU::OpName::vdst); 4815 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, 4816 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 4817 : AMDGPU::OpName::vdata); 4818 if (DataIdx != -1) { 4819 IsAllocatable = VDstIdx != -1 || 4820 AMDGPU::getNamedOperandIdx(TID.Opcode, 4821 AMDGPU::OpName::data1) != -1; 4822 } 4823 } 4824 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, 4825 IsAllocatable); 4826 } 4827 4828 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 4829 unsigned OpNo) const { 4830 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4831 const MCInstrDesc &Desc = get(MI.getOpcode()); 4832 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 4833 Desc.OpInfo[OpNo].RegClass == -1) { 4834 Register Reg = MI.getOperand(OpNo).getReg(); 4835 4836 if (Reg.isVirtual()) 4837 return MRI.getRegClass(Reg); 4838 return RI.getPhysRegClass(Reg); 4839 } 4840 4841 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 4842 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); 4843 } 4844 4845 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 4846 MachineBasicBlock::iterator I = MI; 4847 MachineBasicBlock *MBB = MI.getParent(); 4848 MachineOperand &MO = MI.getOperand(OpIdx); 4849 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 4850 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 4851 const TargetRegisterClass *RC = RI.getRegClass(RCID); 4852 unsigned Size = RI.getRegSizeInBits(*RC); 4853 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; 4854 if (MO.isReg()) 4855 Opcode = AMDGPU::COPY; 4856 else if (RI.isSGPRClass(RC)) 4857 Opcode = (Size == 64) ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 4858 4859 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 4860 const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); 4861 if (RI.getCommonSubClass(VRC64, VRC)) 4862 VRC = VRC64; 4863 else 4864 VRC = &AMDGPU::VGPR_32RegClass; 4865 4866 Register Reg = MRI.createVirtualRegister(VRC); 4867 DebugLoc DL = MBB->findDebugLoc(I); 4868 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 4869 MO.ChangeToRegister(Reg, false); 4870 } 4871 4872 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 4873 MachineRegisterInfo &MRI, 4874 MachineOperand &SuperReg, 4875 const TargetRegisterClass *SuperRC, 4876 unsigned SubIdx, 4877 const TargetRegisterClass *SubRC) 4878 const { 4879 MachineBasicBlock *MBB = MI->getParent(); 4880 DebugLoc DL = MI->getDebugLoc(); 4881 Register SubReg = MRI.createVirtualRegister(SubRC); 4882 4883 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 4884 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4885 .addReg(SuperReg.getReg(), 0, SubIdx); 4886 return SubReg; 4887 } 4888 4889 // Just in case the super register is itself a sub-register, copy it to a new 4890 // value so we don't need to worry about merging its subreg index with the 4891 // SubIdx passed to this function. The register coalescer should be able to 4892 // eliminate this extra copy. 4893 Register NewSuperReg = MRI.createVirtualRegister(SuperRC); 4894 4895 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 4896 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 4897 4898 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4899 .addReg(NewSuperReg, 0, SubIdx); 4900 4901 return SubReg; 4902 } 4903 4904 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 4905 MachineBasicBlock::iterator MII, 4906 MachineRegisterInfo &MRI, 4907 MachineOperand &Op, 4908 const TargetRegisterClass *SuperRC, 4909 unsigned SubIdx, 4910 const TargetRegisterClass *SubRC) const { 4911 if (Op.isImm()) { 4912 if (SubIdx == AMDGPU::sub0) 4913 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 4914 if (SubIdx == AMDGPU::sub1) 4915 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 4916 4917 llvm_unreachable("Unhandled register index for immediate"); 4918 } 4919 4920 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 4921 SubIdx, SubRC); 4922 return MachineOperand::CreateReg(SubReg, false); 4923 } 4924 4925 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 4926 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 4927 assert(Inst.getNumExplicitOperands() == 3); 4928 MachineOperand Op1 = Inst.getOperand(1); 4929 Inst.removeOperand(1); 4930 Inst.addOperand(Op1); 4931 } 4932 4933 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 4934 const MCOperandInfo &OpInfo, 4935 const MachineOperand &MO) const { 4936 if (!MO.isReg()) 4937 return false; 4938 4939 Register Reg = MO.getReg(); 4940 4941 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); 4942 if (Reg.isPhysical()) 4943 return DRC->contains(Reg); 4944 4945 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 4946 4947 if (MO.getSubReg()) { 4948 const MachineFunction *MF = MO.getParent()->getParent()->getParent(); 4949 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); 4950 if (!SuperRC) 4951 return false; 4952 4953 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); 4954 if (!DRC) 4955 return false; 4956 } 4957 return RC->hasSuperClassEq(DRC); 4958 } 4959 4960 
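// Check whether \p MO is legal for a VSrc-like operand: either a register
// that satisfies the operand's register class or an immediate-like operand
// (immediate, frame index, target index or global address).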
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 4961 const MCOperandInfo &OpInfo, 4962 const MachineOperand &MO) const { 4963 if (MO.isReg()) 4964 return isLegalRegOperand(MRI, OpInfo, MO); 4965 4966 // Handle non-register types that are treated like immediates. 4967 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 4968 return true; 4969 } 4970 4971 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 4972 const MachineOperand *MO) const { 4973 const MachineFunction &MF = *MI.getParent()->getParent(); 4974 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4975 const MCInstrDesc &InstDesc = MI.getDesc(); 4976 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 4977 const TargetRegisterClass *DefinedRC = 4978 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 4979 if (!MO) 4980 MO = &MI.getOperand(OpIdx); 4981 4982 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); 4983 int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; 4984 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 4985 if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) 4986 return false; 4987 4988 SmallDenseSet<RegSubRegPair> SGPRsUsed; 4989 if (MO->isReg()) 4990 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); 4991 4992 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4993 if (i == OpIdx) 4994 continue; 4995 const MachineOperand &Op = MI.getOperand(i); 4996 if (Op.isReg()) { 4997 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); 4998 if (!SGPRsUsed.count(SGPR) && 4999 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 5000 if (--ConstantBusLimit <= 0) 5001 return false; 5002 SGPRsUsed.insert(SGPR); 5003 } 5004 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 5005 if (--ConstantBusLimit <= 0) 5006 return false; 5007 } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && 5008 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { 5009 if (!VOP3LiteralLimit--) 5010 return false; 5011 if (--ConstantBusLimit <= 0) 5012 return false; 5013 } 5014 } 5015 } 5016 5017 if (MO->isReg()) { 5018 if (!DefinedRC) { 5019 // This operand allows any register. 5020 return true; 5021 } 5022 if (!isLegalRegOperand(MRI, OpInfo, *MO)) 5023 return false; 5024 bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); 5025 if (IsAGPR && !ST.hasMAIInsts()) 5026 return false; 5027 unsigned Opc = MI.getOpcode(); 5028 if (IsAGPR && 5029 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 5030 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) 5031 return false; 5032 // Atomics should have both vdst and vdata either vgpr or agpr. 5033 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 5034 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, 5035 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); 5036 if ((int)OpIdx == VDstIdx && DataIdx != -1 && 5037 MI.getOperand(DataIdx).isReg() && 5038 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) 5039 return false; 5040 if ((int)OpIdx == DataIdx) { 5041 if (VDstIdx != -1 && 5042 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) 5043 return false; 5044 // DS instructions with 2 src operands also must have tied RC. 
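      // e.g. the data0 and data1 operands of DS_WRITE2_B32 must both be VGPRs
      // or both be AGPRs.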
5045 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, 5046 AMDGPU::OpName::data1); 5047 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && 5048 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) 5049 return false; 5050 } 5051 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && 5052 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && 5053 RI.isSGPRReg(MRI, MO->getReg())) 5054 return false; 5055 return true; 5056 } 5057 5058 // Handle non-register types that are treated like immediates. 5059 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); 5060 5061 if (!DefinedRC) { 5062 // This operand expects an immediate. 5063 return true; 5064 } 5065 5066 return isImmOperandLegal(MI, OpIdx, *MO); 5067 } 5068 5069 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 5070 MachineInstr &MI) const { 5071 unsigned Opc = MI.getOpcode(); 5072 const MCInstrDesc &InstrDesc = get(Opc); 5073 5074 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 5075 MachineOperand &Src0 = MI.getOperand(Src0Idx); 5076 5077 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 5078 MachineOperand &Src1 = MI.getOperand(Src1Idx); 5079 5080 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 5081 // we need to only have one constant bus use before GFX10. 5082 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 5083 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && 5084 Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || 5085 isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) 5086 legalizeOpWithMove(MI, Src0Idx); 5087 5088 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 5089 // both the value to write (src0) and lane select (src1). Fix up non-SGPR 5090 // src0/src1 with V_READFIRSTLANE. 5091 if (Opc == AMDGPU::V_WRITELANE_B32) { 5092 const DebugLoc &DL = MI.getDebugLoc(); 5093 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 5094 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5095 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5096 .add(Src0); 5097 Src0.ChangeToRegister(Reg, false); 5098 } 5099 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 5100 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5101 const DebugLoc &DL = MI.getDebugLoc(); 5102 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5103 .add(Src1); 5104 Src1.ChangeToRegister(Reg, false); 5105 } 5106 return; 5107 } 5108 5109 // No VOP2 instructions support AGPRs. 5110 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) 5111 legalizeOpWithMove(MI, Src0Idx); 5112 5113 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) 5114 legalizeOpWithMove(MI, Src1Idx); 5115 5116 // VOP2 src0 instructions support all operand types, so we don't need to check 5117 // their legality. If src1 is already legal, we don't need to do anything. 5118 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 5119 return; 5120 5121 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 5122 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 5123 // select is uniform. 
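  // Illustrative sketch of the rewrite below (virtual register names are
  // hypothetical):
  //   %dst:sgpr_32 = V_READLANE_B32 %val:vgpr_32, %lane:vgpr_32
  // becomes
  //   %sel:sreg_32_xm0 = V_READFIRSTLANE_B32 %lane:vgpr_32
  //   %dst:sgpr_32 = V_READLANE_B32 %val:vgpr_32, %sel:sreg_32_xm0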
5124 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 5125 RI.isVGPR(MRI, Src1.getReg())) { 5126 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5127 const DebugLoc &DL = MI.getDebugLoc(); 5128 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5129 .add(Src1); 5130 Src1.ChangeToRegister(Reg, false); 5131 return; 5132 } 5133 5134 // We do not use commuteInstruction here because it is too aggressive and will 5135 // commute if it is possible. We only want to commute here if it improves 5136 // legality. This can be called a fairly large number of times so don't waste 5137 // compile time pointlessly swapping and checking legality again. 5138 if (HasImplicitSGPR || !MI.isCommutable()) { 5139 legalizeOpWithMove(MI, Src1Idx); 5140 return; 5141 } 5142 5143 // If src0 can be used as src1, commuting will make the operands legal. 5144 // Otherwise we have to give up and insert a move. 5145 // 5146 // TODO: Other immediate-like operand kinds could be commuted if there was a 5147 // MachineOperand::ChangeTo* for them. 5148 if ((!Src1.isImm() && !Src1.isReg()) || 5149 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 5150 legalizeOpWithMove(MI, Src1Idx); 5151 return; 5152 } 5153 5154 int CommutedOpc = commuteOpcode(MI); 5155 if (CommutedOpc == -1) { 5156 legalizeOpWithMove(MI, Src1Idx); 5157 return; 5158 } 5159 5160 MI.setDesc(get(CommutedOpc)); 5161 5162 Register Src0Reg = Src0.getReg(); 5163 unsigned Src0SubReg = Src0.getSubReg(); 5164 bool Src0Kill = Src0.isKill(); 5165 5166 if (Src1.isImm()) 5167 Src0.ChangeToImmediate(Src1.getImm()); 5168 else if (Src1.isReg()) { 5169 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 5170 Src0.setSubReg(Src1.getSubReg()); 5171 } else 5172 llvm_unreachable("Should only have register or immediate operands"); 5173 5174 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 5175 Src1.setSubReg(Src0SubReg); 5176 fixImplicitOperands(MI); 5177 } 5178 5179 // Legalize VOP3 operands. All operand types are supported for any operand 5180 // but only one literal constant and only starting from GFX10. 5181 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 5182 MachineInstr &MI) const { 5183 unsigned Opc = MI.getOpcode(); 5184 5185 int VOP3Idx[3] = { 5186 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 5187 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 5188 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 5189 }; 5190 5191 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || 5192 Opc == AMDGPU::V_PERMLANEX16_B32_e64) { 5193 // src1 and src2 must be scalar 5194 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); 5195 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); 5196 const DebugLoc &DL = MI.getDebugLoc(); 5197 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { 5198 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5199 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5200 .add(Src1); 5201 Src1.ChangeToRegister(Reg, false); 5202 } 5203 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { 5204 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5205 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5206 .add(Src2); 5207 Src2.ChangeToRegister(Reg, false); 5208 } 5209 } 5210 5211 // Find the one SGPR operand we are allowed to use. 5212 int ConstantBusLimit = ST.getConstantBusLimit(Opc); 5213 int LiteralLimit = ST.hasVOP3Literal() ? 
1 : 0; 5214 SmallDenseSet<unsigned> SGPRsUsed; 5215 Register SGPRReg = findUsedSGPR(MI, VOP3Idx); 5216 if (SGPRReg != AMDGPU::NoRegister) { 5217 SGPRsUsed.insert(SGPRReg); 5218 --ConstantBusLimit; 5219 } 5220 5221 for (int Idx : VOP3Idx) { 5222 if (Idx == -1) 5223 break; 5224 MachineOperand &MO = MI.getOperand(Idx); 5225 5226 if (!MO.isReg()) { 5227 if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) 5228 continue; 5229 5230 if (LiteralLimit > 0 && ConstantBusLimit > 0) { 5231 --LiteralLimit; 5232 --ConstantBusLimit; 5233 continue; 5234 } 5235 5236 --LiteralLimit; 5237 --ConstantBusLimit; 5238 legalizeOpWithMove(MI, Idx); 5239 continue; 5240 } 5241 5242 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && 5243 !isOperandLegal(MI, Idx, &MO)) { 5244 legalizeOpWithMove(MI, Idx); 5245 continue; 5246 } 5247 5248 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) 5249 continue; // VGPRs are legal 5250 5251 // We can use one SGPR in each VOP3 instruction prior to GFX10 5252 // and two starting from GFX10. 5253 if (SGPRsUsed.count(MO.getReg())) 5254 continue; 5255 if (ConstantBusLimit > 0) { 5256 SGPRsUsed.insert(MO.getReg()); 5257 --ConstantBusLimit; 5258 continue; 5259 } 5260 5261 // If we make it this far, then the operand is not legal and we must 5262 // legalize it. 5263 legalizeOpWithMove(MI, Idx); 5264 } 5265 } 5266 5267 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 5268 MachineRegisterInfo &MRI) const { 5269 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 5270 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 5271 Register DstReg = MRI.createVirtualRegister(SRC); 5272 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 5273 5274 if (RI.hasAGPRs(VRC)) { 5275 VRC = RI.getEquivalentVGPRClass(VRC); 5276 Register NewSrcReg = MRI.createVirtualRegister(VRC); 5277 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5278 get(TargetOpcode::COPY), NewSrcReg) 5279 .addReg(SrcReg); 5280 SrcReg = NewSrcReg; 5281 } 5282 5283 if (SubRegs == 1) { 5284 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5285 get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 5286 .addReg(SrcReg); 5287 return DstReg; 5288 } 5289 5290 SmallVector<unsigned, 8> SRegs; 5291 for (unsigned i = 0; i < SubRegs; ++i) { 5292 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5293 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5294 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 5295 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 5296 SRegs.push_back(SGPR); 5297 } 5298 5299 MachineInstrBuilder MIB = 5300 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5301 get(AMDGPU::REG_SEQUENCE), DstReg); 5302 for (unsigned i = 0; i < SubRegs; ++i) { 5303 MIB.addReg(SRegs[i]); 5304 MIB.addImm(RI.getSubRegFromChannel(i)); 5305 } 5306 return DstReg; 5307 } 5308 5309 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 5310 MachineInstr &MI) const { 5311 5312 // If the pointer is store in VGPRs, then we need to move them to 5313 // SGPRs using v_readfirstlane. This is safe because we only select 5314 // loads with uniform pointers to SMRD instruction so we know the 5315 // pointer value is uniform. 
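  // Illustrative sketch (register names hypothetical): a 64-bit VGPR sbase is
  // rebuilt as an SGPR pair via readlaneVGPRToSGPR, roughly:
  //   %lo:sgpr_32 = V_READFIRSTLANE_B32 %vbase.sub0
  //   %hi:sgpr_32 = V_READFIRSTLANE_B32 %vbase.sub1
  //   %sbase:sreg_64 = REG_SEQUENCE %lo, sub0, %hi, sub1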
5316 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 5317 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 5318 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 5319 SBase->setReg(SGPR); 5320 } 5321 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); 5322 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 5323 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 5324 SOff->setReg(SGPR); 5325 } 5326 } 5327 5328 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { 5329 unsigned Opc = Inst.getOpcode(); 5330 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 5331 if (OldSAddrIdx < 0) 5332 return false; 5333 5334 assert(isSegmentSpecificFLAT(Inst)); 5335 5336 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); 5337 if (NewOpc < 0) 5338 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); 5339 if (NewOpc < 0) 5340 return false; 5341 5342 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); 5343 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); 5344 if (RI.isSGPRReg(MRI, SAddr.getReg())) 5345 return false; 5346 5347 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); 5348 if (NewVAddrIdx < 0) 5349 return false; 5350 5351 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 5352 5353 // Check vaddr, it shall be zero or absent. 5354 MachineInstr *VAddrDef = nullptr; 5355 if (OldVAddrIdx >= 0) { 5356 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); 5357 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); 5358 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || 5359 !VAddrDef->getOperand(1).isImm() || 5360 VAddrDef->getOperand(1).getImm() != 0) 5361 return false; 5362 } 5363 5364 const MCInstrDesc &NewDesc = get(NewOpc); 5365 Inst.setDesc(NewDesc); 5366 5367 // Callers expect iterator to be valid after this call, so modify the 5368 // instruction in place. 5369 if (OldVAddrIdx == NewVAddrIdx) { 5370 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); 5371 // Clear use list from the old vaddr holding a zero register. 5372 MRI.removeRegOperandFromUseList(&NewVAddr); 5373 MRI.moveOperands(&NewVAddr, &SAddr, 1); 5374 Inst.removeOperand(OldSAddrIdx); 5375 // Update the use list with the pointer we have just moved from vaddr to 5376 // saddr position. Otherwise new vaddr will be missing from the use list. 5377 MRI.removeRegOperandFromUseList(&NewVAddr); 5378 MRI.addRegOperandToUseList(&NewVAddr); 5379 } else { 5380 assert(OldSAddrIdx == NewVAddrIdx); 5381 5382 if (OldVAddrIdx >= 0) { 5383 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, 5384 AMDGPU::OpName::vdst_in); 5385 5386 // removeOperand doesn't try to fixup tied operand indexes at it goes, so 5387 // it asserts. Untie the operands for now and retie them afterwards. 5388 if (NewVDstIn != -1) { 5389 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); 5390 Inst.untieRegOperand(OldVDstIn); 5391 } 5392 5393 Inst.removeOperand(OldVAddrIdx); 5394 5395 if (NewVDstIn != -1) { 5396 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 5397 Inst.tieOperands(NewVDst, NewVDstIn); 5398 } 5399 } 5400 } 5401 5402 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) 5403 VAddrDef->eraseFromParent(); 5404 5405 return true; 5406 } 5407 5408 // FIXME: Remove this when SelectionDAG is obsoleted. 
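// Illustrative sketch (opcode and register names are examples only): a
// saddr-form access such as GLOBAL_LOAD_DWORD_SADDR whose saddr ended up in a
// VGPR, and whose vaddr is zero or absent, is rewritten by moveFlatAddrToVGPR
// to the plain vaddr form (e.g. GLOBAL_LOAD_DWORD) with the old saddr register
// moved into the vaddr slot; otherwise the saddr is made uniform with
// readlaneVGPRToSGPR.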
5409 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, 5410 MachineInstr &MI) const { 5411 if (!isSegmentSpecificFLAT(MI)) 5412 return; 5413 5414 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence 5415 // thinks they are uniform, so a readfirstlane should be valid. 5416 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); 5417 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) 5418 return; 5419 5420 if (moveFlatAddrToVGPR(MI)) 5421 return; 5422 5423 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); 5424 SAddr->setReg(ToSGPR); 5425 } 5426 5427 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 5428 MachineBasicBlock::iterator I, 5429 const TargetRegisterClass *DstRC, 5430 MachineOperand &Op, 5431 MachineRegisterInfo &MRI, 5432 const DebugLoc &DL) const { 5433 Register OpReg = Op.getReg(); 5434 unsigned OpSubReg = Op.getSubReg(); 5435 5436 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 5437 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 5438 5439 // Check if operand is already the correct register class. 5440 if (DstRC == OpRC) 5441 return; 5442 5443 Register DstReg = MRI.createVirtualRegister(DstRC); 5444 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 5445 5446 Op.setReg(DstReg); 5447 Op.setSubReg(0); 5448 5449 MachineInstr *Def = MRI.getVRegDef(OpReg); 5450 if (!Def) 5451 return; 5452 5453 // Try to eliminate the copy if it is copying an immediate value. 5454 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) 5455 FoldImmediate(*Copy, *Def, OpReg, &MRI); 5456 5457 bool ImpDef = Def->isImplicitDef(); 5458 while (!ImpDef && Def && Def->isCopy()) { 5459 if (Def->getOperand(1).getReg().isPhysical()) 5460 break; 5461 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); 5462 ImpDef = Def && Def->isImplicitDef(); 5463 } 5464 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && 5465 !ImpDef) 5466 Copy.addReg(AMDGPU::EXEC, RegState::Implicit); 5467 } 5468 5469 // Emit the actual waterfall loop, executing the wrapped instruction for each 5470 // unique value of \p Rsrc across all lanes. In the best case we execute 1 5471 // iteration, in the worst case we execute 64 (once per lane). 5472 static void 5473 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, 5474 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 5475 MachineBasicBlock &BodyBB, const DebugLoc &DL, 5476 MachineOperand &Rsrc) { 5477 MachineFunction &MF = *OrigBB.getParent(); 5478 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 5479 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5480 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5481 unsigned SaveExecOpc = 5482 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 5483 unsigned XorTermOpc = 5484 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 5485 unsigned AndOpc = 5486 ST.isWave32() ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 5487 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5488 5489 MachineBasicBlock::iterator I = LoopBB.begin(); 5490 5491 SmallVector<Register, 8> ReadlanePieces; 5492 Register CondReg = AMDGPU::NoRegister; 5493 5494 Register VRsrc = Rsrc.getReg(); 5495 unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); 5496 5497 unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); 5498 unsigned NumSubRegs = RegSize / 32; 5499 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); 5500 5501 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { 5502 5503 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5504 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5505 5506 // Read the next variant <- also loop target. 5507 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) 5508 .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); 5509 5510 // Read the next variant <- also loop target. 5511 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) 5512 .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); 5513 5514 ReadlanePieces.push_back(CurRegLo); 5515 ReadlanePieces.push_back(CurRegHi); 5516 5517 // Comparison is to be done as 64-bit. 5518 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); 5519 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) 5520 .addReg(CurRegLo) 5521 .addImm(AMDGPU::sub0) 5522 .addReg(CurRegHi) 5523 .addImm(AMDGPU::sub1); 5524 5525 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); 5526 auto Cmp = 5527 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) 5528 .addReg(CurReg); 5529 if (NumSubRegs <= 2) 5530 Cmp.addReg(VRsrc); 5531 else 5532 Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); 5533 5534 // Combine the comparison results with AND. 5535 if (CondReg == AMDGPU::NoRegister) // First. 5536 CondReg = NewCondReg; 5537 else { // If not the first, we create an AND. 5538 Register AndReg = MRI.createVirtualRegister(BoolXExecRC); 5539 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) 5540 .addReg(CondReg) 5541 .addReg(NewCondReg); 5542 CondReg = AndReg; 5543 } 5544 } // End for loop. 5545 5546 auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); 5547 Register SRsrc = MRI.createVirtualRegister(SRsrcRC); 5548 5549 // Build scalar Rsrc. 5550 auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); 5551 unsigned Channel = 0; 5552 for (Register Piece : ReadlanePieces) { 5553 Merge.addReg(Piece) 5554 .addImm(TRI->getSubRegFromChannel(Channel++)); 5555 } 5556 5557 // Update Rsrc operand to use the SGPR Rsrc. 5558 Rsrc.setReg(SRsrc); 5559 Rsrc.setIsKill(true); 5560 5561 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 5562 MRI.setSimpleHint(SaveExec, CondReg); 5563 5564 // Update EXEC to matching lanes, saving original to SaveExec. 5565 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) 5566 .addReg(CondReg, RegState::Kill); 5567 5568 // The original instruction is here; we insert the terminators after it. 5569 I = BodyBB.end(); 5570 5571 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
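  // Worked example for a small mask (illustrative): if the exec saved by
  // S_AND_SAVEEXEC was 0b1111 and the lanes matching this iteration's rsrc
  // value were 0b0011, the body ran with exec = 0b0011 and the XOR below
  // leaves exec = 0b1100, i.e. only the lanes still to be processed.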
5572 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) 5573 .addReg(Exec) 5574 .addReg(SaveExec); 5575 5576 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); 5577 } 5578 5579 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register 5580 // with SGPRs by iterating over all unique values across all lanes. 5581 // Returns the loop basic block that now contains \p MI. 5582 static MachineBasicBlock * 5583 loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, 5584 MachineOperand &Rsrc, MachineDominatorTree *MDT, 5585 MachineBasicBlock::iterator Begin = nullptr, 5586 MachineBasicBlock::iterator End = nullptr) { 5587 MachineBasicBlock &MBB = *MI.getParent(); 5588 MachineFunction &MF = *MBB.getParent(); 5589 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 5590 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5591 MachineRegisterInfo &MRI = MF.getRegInfo(); 5592 if (!Begin.isValid()) 5593 Begin = &MI; 5594 if (!End.isValid()) { 5595 End = &MI; 5596 ++End; 5597 } 5598 const DebugLoc &DL = MI.getDebugLoc(); 5599 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5600 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 5601 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5602 5603 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 5604 5605 // Save the EXEC mask 5606 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 5607 5608 // Killed uses in the instruction we are waterfalling around will be 5609 // incorrect due to the added control-flow. 5610 MachineBasicBlock::iterator AfterMI = MI; 5611 ++AfterMI; 5612 for (auto I = Begin; I != AfterMI; I++) { 5613 for (auto &MO : I->uses()) { 5614 if (MO.isReg() && MO.isUse()) { 5615 MRI.clearKillFlags(MO.getReg()); 5616 } 5617 } 5618 } 5619 5620 // To insert the loop we need to split the block. Move everything after this 5621 // point to a new block, and insert a new empty block between the two. 5622 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 5623 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); 5624 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 5625 MachineFunction::iterator MBBI(MBB); 5626 ++MBBI; 5627 5628 MF.insert(MBBI, LoopBB); 5629 MF.insert(MBBI, BodyBB); 5630 MF.insert(MBBI, RemainderBB); 5631 5632 LoopBB->addSuccessor(BodyBB); 5633 BodyBB->addSuccessor(LoopBB); 5634 BodyBB->addSuccessor(RemainderBB); 5635 5636 // Move Begin to MI to the BodyBB, and the remainder of the block to 5637 // RemainderBB. 5638 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 5639 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); 5640 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); 5641 5642 MBB.addSuccessor(LoopBB); 5643 5644 // Update dominators. We know that MBB immediately dominates LoopBB, that 5645 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates 5646 // RemainderBB. RemainderBB immediately dominates all of the successors 5647 // transferred to it from MBB that MBB used to properly dominate. 
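  // The resulting CFG is, schematically:
  //   MBB -> LoopBB -> BodyBB -> RemainderBB, with a BodyBB -> LoopBB
  //   backedge taken while any lane still has an unprocessed rsrc value.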
5648 if (MDT) { 5649 MDT->addNewBlock(LoopBB, &MBB); 5650 MDT->addNewBlock(BodyBB, LoopBB); 5651 MDT->addNewBlock(RemainderBB, BodyBB); 5652 for (auto &Succ : RemainderBB->successors()) { 5653 if (MDT->properlyDominates(&MBB, Succ)) { 5654 MDT->changeImmediateDominator(Succ, RemainderBB); 5655 } 5656 } 5657 } 5658 5659 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc); 5660 5661 // Restore the EXEC mask 5662 MachineBasicBlock::iterator First = RemainderBB->begin(); 5663 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 5664 return BodyBB; 5665 } 5666 5667 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 5668 static std::tuple<unsigned, unsigned> 5669 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 5670 MachineBasicBlock &MBB = *MI.getParent(); 5671 MachineFunction &MF = *MBB.getParent(); 5672 MachineRegisterInfo &MRI = MF.getRegInfo(); 5673 5674 // Extract the ptr from the resource descriptor. 5675 unsigned RsrcPtr = 5676 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 5677 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 5678 5679 // Create an empty resource descriptor 5680 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5681 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5682 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5683 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 5684 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 5685 5686 // Zero64 = 0 5687 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 5688 .addImm(0); 5689 5690 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 5691 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 5692 .addImm(RsrcDataFormat & 0xFFFFFFFF); 5693 5694 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 5695 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 5696 .addImm(RsrcDataFormat >> 32); 5697 5698 // NewSRsrc = {Zero64, SRsrcFormat} 5699 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) 5700 .addReg(Zero64) 5701 .addImm(AMDGPU::sub0_sub1) 5702 .addReg(SRsrcFormatLo) 5703 .addImm(AMDGPU::sub2) 5704 .addReg(SRsrcFormatHi) 5705 .addImm(AMDGPU::sub3); 5706 5707 return std::make_tuple(RsrcPtr, NewSRsrc); 5708 } 5709 5710 MachineBasicBlock * 5711 SIInstrInfo::legalizeOperands(MachineInstr &MI, 5712 MachineDominatorTree *MDT) const { 5713 MachineFunction &MF = *MI.getParent()->getParent(); 5714 MachineRegisterInfo &MRI = MF.getRegInfo(); 5715 MachineBasicBlock *CreatedBB = nullptr; 5716 5717 // Legalize VOP2 5718 if (isVOP2(MI) || isVOPC(MI)) { 5719 legalizeOperandsVOP2(MRI, MI); 5720 return CreatedBB; 5721 } 5722 5723 // Legalize VOP3 5724 if (isVOP3(MI)) { 5725 legalizeOperandsVOP3(MRI, MI); 5726 return CreatedBB; 5727 } 5728 5729 // Legalize SMRD 5730 if (isSMRD(MI)) { 5731 legalizeOperandsSMRD(MRI, MI); 5732 return CreatedBB; 5733 } 5734 5735 // Legalize FLAT 5736 if (isFLAT(MI)) { 5737 legalizeOperandsFLAT(MRI, MI); 5738 return CreatedBB; 5739 } 5740 5741 // Legalize REG_SEQUENCE and PHI 5742 // The register class of the operands much be the same type as the register 5743 // class of the output. 
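  // Illustrative PHI case (register names hypothetical): given
  //   %r = PHI %a:sgpr_32, %bb.1, %b:vgpr_32, %bb.2
  // the VGPR input forces a vector register class, and each incoming value is
  // copied into that class right before the terminator of its predecessor
  // block.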
5744 if (MI.getOpcode() == AMDGPU::PHI) { 5745 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; 5746 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { 5747 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) 5748 continue; 5749 const TargetRegisterClass *OpRC = 5750 MRI.getRegClass(MI.getOperand(i).getReg()); 5751 if (RI.hasVectorRegisters(OpRC)) { 5752 VRC = OpRC; 5753 } else { 5754 SRC = OpRC; 5755 } 5756 } 5757 5758 // If any of the operands are VGPR registers, then they all most be 5759 // otherwise we will create illegal VGPR->SGPR copies when legalizing 5760 // them. 5761 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { 5762 if (!VRC) { 5763 assert(SRC); 5764 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { 5765 VRC = &AMDGPU::VReg_1RegClass; 5766 } else 5767 VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) 5768 ? RI.getEquivalentAGPRClass(SRC) 5769 : RI.getEquivalentVGPRClass(SRC); 5770 } else { 5771 VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) 5772 ? RI.getEquivalentAGPRClass(VRC) 5773 : RI.getEquivalentVGPRClass(VRC); 5774 } 5775 RC = VRC; 5776 } else { 5777 RC = SRC; 5778 } 5779 5780 // Update all the operands so they have the same type. 5781 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 5782 MachineOperand &Op = MI.getOperand(I); 5783 if (!Op.isReg() || !Op.getReg().isVirtual()) 5784 continue; 5785 5786 // MI is a PHI instruction. 5787 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 5788 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 5789 5790 // Avoid creating no-op copies with the same src and dst reg class. These 5791 // confuse some of the machine passes. 5792 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 5793 } 5794 } 5795 5796 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 5797 // VGPR dest type and SGPR sources, insert copies so all operands are 5798 // VGPRs. This seems to help operand folding / the register coalescer. 5799 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 5800 MachineBasicBlock *MBB = MI.getParent(); 5801 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 5802 if (RI.hasVGPRs(DstRC)) { 5803 // Update all the operands so they are VGPR register classes. These may 5804 // not be the same register class because REG_SEQUENCE supports mixing 5805 // subregister index types e.g. 
sub0_sub1 + sub2 + sub3 5806 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 5807 MachineOperand &Op = MI.getOperand(I); 5808 if (!Op.isReg() || !Op.getReg().isVirtual()) 5809 continue; 5810 5811 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 5812 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 5813 if (VRC == OpRC) 5814 continue; 5815 5816 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 5817 Op.setIsKill(); 5818 } 5819 } 5820 5821 return CreatedBB; 5822 } 5823 5824 // Legalize INSERT_SUBREG 5825 // src0 must have the same register class as dst 5826 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 5827 Register Dst = MI.getOperand(0).getReg(); 5828 Register Src0 = MI.getOperand(1).getReg(); 5829 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 5830 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 5831 if (DstRC != Src0RC) { 5832 MachineBasicBlock *MBB = MI.getParent(); 5833 MachineOperand &Op = MI.getOperand(1); 5834 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 5835 } 5836 return CreatedBB; 5837 } 5838 5839 // Legalize SI_INIT_M0 5840 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { 5841 MachineOperand &Src = MI.getOperand(0); 5842 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 5843 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 5844 return CreatedBB; 5845 } 5846 5847 // Legalize MIMG and MUBUF/MTBUF for shaders. 5848 // 5849 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 5850 // scratch memory access. In both cases, the legalization never involves 5851 // conversion to the addr64 form. 5852 if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && 5853 (isMUBUF(MI) || isMTBUF(MI)))) { 5854 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 5855 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) 5856 CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); 5857 5858 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 5859 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) 5860 CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); 5861 5862 return CreatedBB; 5863 } 5864 5865 // Legalize SI_CALL 5866 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { 5867 MachineOperand *Dest = &MI.getOperand(0); 5868 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { 5869 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and 5870 // following copies, we also need to move copies from and to physical 5871 // registers into the loop block. 5872 unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); 5873 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); 5874 5875 // Also move the copies to physical registers into the loop block 5876 MachineBasicBlock &MBB = *MI.getParent(); 5877 MachineBasicBlock::iterator Start(&MI); 5878 while (Start->getOpcode() != FrameSetupOpcode) 5879 --Start; 5880 MachineBasicBlock::iterator End(&MI); 5881 while (End->getOpcode() != FrameDestroyOpcode) 5882 ++End; 5883 // Also include following copies of the return value 5884 ++End; 5885 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && 5886 MI.definesRegister(End->getOperand(1).getReg())) 5887 ++End; 5888 CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); 5889 } 5890 } 5891 5892 // Legalize MUBUF* instructions. 
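  // Note on the srsrc rewrites below: when the base pointer is extracted from
  // a VGPR rsrc it is added into vaddr, and the replacement descriptor keeps a
  // zero base (see extractRsrcPtr), so the effective address is unchanged.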
5893 int RsrcIdx = 5894 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 5895 if (RsrcIdx != -1) { 5896 // We have an MUBUF instruction 5897 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 5898 unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; 5899 if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), 5900 RI.getRegClass(RsrcRC))) { 5901 // The operands are legal. 5902 // FIXME: We may need to legalize operands besides srsrc. 5903 return CreatedBB; 5904 } 5905 5906 // Legalize a VGPR Rsrc. 5907 // 5908 // If the instruction is _ADDR64, we can avoid a waterfall by extracting 5909 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using 5910 // a zero-value SRsrc. 5911 // 5912 // If the instruction is _OFFSET (both idxen and offen disabled), and we 5913 // support ADDR64 instructions, we can convert to ADDR64 and do the same as 5914 // above. 5915 // 5916 // Otherwise we are on non-ADDR64 hardware, and/or we have 5917 // idxen/offen/bothen and we fall back to a waterfall loop. 5918 5919 MachineBasicBlock &MBB = *MI.getParent(); 5920 5921 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 5922 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { 5923 // This is already an ADDR64 instruction so we need to add the pointer 5924 // extracted from the resource descriptor to the current value of VAddr. 5925 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5926 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5927 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5928 5929 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5930 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); 5931 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); 5932 5933 unsigned RsrcPtr, NewSRsrc; 5934 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 5935 5936 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 5937 const DebugLoc &DL = MI.getDebugLoc(); 5938 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) 5939 .addDef(CondReg0) 5940 .addReg(RsrcPtr, 0, AMDGPU::sub0) 5941 .addReg(VAddr->getReg(), 0, AMDGPU::sub0) 5942 .addImm(0); 5943 5944 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 5945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) 5946 .addDef(CondReg1, RegState::Dead) 5947 .addReg(RsrcPtr, 0, AMDGPU::sub1) 5948 .addReg(VAddr->getReg(), 0, AMDGPU::sub1) 5949 .addReg(CondReg0, RegState::Kill) 5950 .addImm(0); 5951 5952 // NewVaddr = {NewVaddrHi, NewVaddrLo} 5953 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 5954 .addReg(NewVAddrLo) 5955 .addImm(AMDGPU::sub0) 5956 .addReg(NewVAddrHi) 5957 .addImm(AMDGPU::sub1); 5958 5959 VAddr->setReg(NewVAddr); 5960 Rsrc->setReg(NewSRsrc); 5961 } else if (!VAddr && ST.hasAddr64()) { 5962 // This instructions is the _OFFSET variant, so we need to convert it to 5963 // ADDR64. 
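      // Illustrative rewrite (opcode names used as examples): a
      // BUFFER_LOAD_DWORD_OFFSET-style access becomes its
      // BUFFER_LOAD_DWORD_ADDR64 counterpart, with a new vaddr holding the
      // pointer extracted from the old descriptor and a zero-base replacement
      // srsrc built by extractRsrcPtr.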
5964 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && 5965 "FIXME: Need to emit flat atomics here"); 5966 5967 unsigned RsrcPtr, NewSRsrc; 5968 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 5969 5970 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5971 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); 5972 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 5973 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 5974 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); 5975 5976 // Atomics with return have an additional tied operand and are 5977 // missing some of the special bits. 5978 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); 5979 MachineInstr *Addr64; 5980 5981 if (!VDataIn) { 5982 // Regular buffer load / store. 5983 MachineInstrBuilder MIB = 5984 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 5985 .add(*VData) 5986 .addReg(NewVAddr) 5987 .addReg(NewSRsrc) 5988 .add(*SOffset) 5989 .add(*Offset); 5990 5991 if (const MachineOperand *CPol = 5992 getNamedOperand(MI, AMDGPU::OpName::cpol)) { 5993 MIB.addImm(CPol->getImm()); 5994 } 5995 5996 if (const MachineOperand *TFE = 5997 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 5998 MIB.addImm(TFE->getImm()); 5999 } 6000 6001 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); 6002 6003 MIB.cloneMemRefs(MI); 6004 Addr64 = MIB; 6005 } else { 6006 // Atomics with return. 6007 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 6008 .add(*VData) 6009 .add(*VDataIn) 6010 .addReg(NewVAddr) 6011 .addReg(NewSRsrc) 6012 .add(*SOffset) 6013 .add(*Offset) 6014 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol)) 6015 .cloneMemRefs(MI); 6016 } 6017 6018 MI.removeFromParent(); 6019 6020 // NewVaddr = {NewVaddrHi, NewVaddrLo} 6021 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 6022 NewVAddr) 6023 .addReg(RsrcPtr, 0, AMDGPU::sub0) 6024 .addImm(AMDGPU::sub0) 6025 .addReg(RsrcPtr, 0, AMDGPU::sub1) 6026 .addImm(AMDGPU::sub1); 6027 } else { 6028 // This is another variant; legalize Rsrc with waterfall loop from VGPRs 6029 // to SGPRs. 6030 CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); 6031 return CreatedBB; 6032 } 6033 } 6034 return CreatedBB; 6035 } 6036 6037 MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, 6038 MachineDominatorTree *MDT) const { 6039 SetVectorType Worklist; 6040 Worklist.insert(&TopInst); 6041 MachineBasicBlock *CreatedBB = nullptr; 6042 MachineBasicBlock *CreatedBBTmp = nullptr; 6043 6044 while (!Worklist.empty()) { 6045 MachineInstr &Inst = *Worklist.pop_back_val(); 6046 MachineBasicBlock *MBB = Inst.getParent(); 6047 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 6048 6049 unsigned Opcode = Inst.getOpcode(); 6050 unsigned NewOpcode = getVALUOp(Inst); 6051 6052 // Handle some special cases 6053 switch (Opcode) { 6054 default: 6055 break; 6056 case AMDGPU::S_ADD_U64_PSEUDO: 6057 case AMDGPU::S_SUB_U64_PSEUDO: 6058 splitScalar64BitAddSub(Worklist, Inst, MDT); 6059 Inst.eraseFromParent(); 6060 continue; 6061 case AMDGPU::S_ADD_I32: 6062 case AMDGPU::S_SUB_I32: { 6063 // FIXME: The u32 versions currently selected use the carry. 
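      // On subtargets with carry-less adds, moveScalarAddSub rewrites this in
      // place to V_ADD_U32_e64 / V_SUB_U32_e64 (see below); otherwise Changed
      // stays false and we fall through to the generic lowering.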
6064 bool Changed; 6065 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); 6066 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6067 CreatedBB = CreatedBBTmp; 6068 if (Changed) 6069 continue; 6070 6071 // Default handling 6072 break; 6073 } 6074 case AMDGPU::S_AND_B64: 6075 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 6076 Inst.eraseFromParent(); 6077 continue; 6078 6079 case AMDGPU::S_OR_B64: 6080 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 6081 Inst.eraseFromParent(); 6082 continue; 6083 6084 case AMDGPU::S_XOR_B64: 6085 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 6086 Inst.eraseFromParent(); 6087 continue; 6088 6089 case AMDGPU::S_NAND_B64: 6090 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 6091 Inst.eraseFromParent(); 6092 continue; 6093 6094 case AMDGPU::S_NOR_B64: 6095 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 6096 Inst.eraseFromParent(); 6097 continue; 6098 6099 case AMDGPU::S_XNOR_B64: 6100 if (ST.hasDLInsts()) 6101 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 6102 else 6103 splitScalar64BitXnor(Worklist, Inst, MDT); 6104 Inst.eraseFromParent(); 6105 continue; 6106 6107 case AMDGPU::S_ANDN2_B64: 6108 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 6109 Inst.eraseFromParent(); 6110 continue; 6111 6112 case AMDGPU::S_ORN2_B64: 6113 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 6114 Inst.eraseFromParent(); 6115 continue; 6116 6117 case AMDGPU::S_BREV_B64: 6118 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); 6119 Inst.eraseFromParent(); 6120 continue; 6121 6122 case AMDGPU::S_NOT_B64: 6123 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 6124 Inst.eraseFromParent(); 6125 continue; 6126 6127 case AMDGPU::S_BCNT1_I32_B64: 6128 splitScalar64BitBCNT(Worklist, Inst); 6129 Inst.eraseFromParent(); 6130 continue; 6131 6132 case AMDGPU::S_BFE_I64: 6133 splitScalar64BitBFE(Worklist, Inst); 6134 Inst.eraseFromParent(); 6135 continue; 6136 6137 case AMDGPU::S_LSHL_B32: 6138 if (ST.hasOnlyRevVALUShifts()) { 6139 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 6140 swapOperands(Inst); 6141 } 6142 break; 6143 case AMDGPU::S_ASHR_I32: 6144 if (ST.hasOnlyRevVALUShifts()) { 6145 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 6146 swapOperands(Inst); 6147 } 6148 break; 6149 case AMDGPU::S_LSHR_B32: 6150 if (ST.hasOnlyRevVALUShifts()) { 6151 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 6152 swapOperands(Inst); 6153 } 6154 break; 6155 case AMDGPU::S_LSHL_B64: 6156 if (ST.hasOnlyRevVALUShifts()) { 6157 NewOpcode = AMDGPU::V_LSHLREV_B64_e64; 6158 swapOperands(Inst); 6159 } 6160 break; 6161 case AMDGPU::S_ASHR_I64: 6162 if (ST.hasOnlyRevVALUShifts()) { 6163 NewOpcode = AMDGPU::V_ASHRREV_I64_e64; 6164 swapOperands(Inst); 6165 } 6166 break; 6167 case AMDGPU::S_LSHR_B64: 6168 if (ST.hasOnlyRevVALUShifts()) { 6169 NewOpcode = AMDGPU::V_LSHRREV_B64_e64; 6170 swapOperands(Inst); 6171 } 6172 break; 6173 6174 case AMDGPU::S_ABS_I32: 6175 lowerScalarAbs(Worklist, Inst); 6176 Inst.eraseFromParent(); 6177 continue; 6178 6179 case AMDGPU::S_CBRANCH_SCC0: 6180 case AMDGPU::S_CBRANCH_SCC1: { 6181 // Clear unused bits of vcc 6182 Register CondReg = Inst.getOperand(1).getReg(); 6183 bool IsSCC = CondReg == AMDGPU::SCC; 6184 Register VCC = RI.getVCC(); 6185 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 6186 unsigned Opc = ST.isWave32() ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 6187 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) 6188 .addReg(EXEC) 6189 .addReg(IsSCC ? VCC : CondReg); 6190 Inst.removeOperand(1); 6191 } 6192 break; 6193 6194 case AMDGPU::S_BFE_U64: 6195 case AMDGPU::S_BFM_B64: 6196 llvm_unreachable("Moving this op to VALU not implemented"); 6197 6198 case AMDGPU::S_PACK_LL_B32_B16: 6199 case AMDGPU::S_PACK_LH_B32_B16: 6200 case AMDGPU::S_PACK_HH_B32_B16: 6201 movePackToVALU(Worklist, MRI, Inst); 6202 Inst.eraseFromParent(); 6203 continue; 6204 6205 case AMDGPU::S_XNOR_B32: 6206 lowerScalarXnor(Worklist, Inst); 6207 Inst.eraseFromParent(); 6208 continue; 6209 6210 case AMDGPU::S_NAND_B32: 6211 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 6212 Inst.eraseFromParent(); 6213 continue; 6214 6215 case AMDGPU::S_NOR_B32: 6216 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 6217 Inst.eraseFromParent(); 6218 continue; 6219 6220 case AMDGPU::S_ANDN2_B32: 6221 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 6222 Inst.eraseFromParent(); 6223 continue; 6224 6225 case AMDGPU::S_ORN2_B32: 6226 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 6227 Inst.eraseFromParent(); 6228 continue; 6229 6230 // TODO: remove as soon as everything is ready 6231 // to replace VGPR to SGPR copy with V_READFIRSTLANEs. 6232 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO 6233 // can only be selected from the uniform SDNode. 6234 case AMDGPU::S_ADD_CO_PSEUDO: 6235 case AMDGPU::S_SUB_CO_PSEUDO: { 6236 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 6237 ? AMDGPU::V_ADDC_U32_e64 6238 : AMDGPU::V_SUBB_U32_e64; 6239 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6240 6241 Register CarryInReg = Inst.getOperand(4).getReg(); 6242 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { 6243 Register NewCarryReg = MRI.createVirtualRegister(CarryRC); 6244 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) 6245 .addReg(CarryInReg); 6246 } 6247 6248 Register CarryOutReg = Inst.getOperand(1).getReg(); 6249 6250 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( 6251 MRI.getRegClass(Inst.getOperand(0).getReg()))); 6252 MachineInstr *CarryOp = 6253 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) 6254 .addReg(CarryOutReg, RegState::Define) 6255 .add(Inst.getOperand(2)) 6256 .add(Inst.getOperand(3)) 6257 .addReg(CarryInReg) 6258 .addImm(0); 6259 CreatedBBTmp = legalizeOperands(*CarryOp); 6260 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6261 CreatedBB = CreatedBBTmp; 6262 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); 6263 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); 6264 Inst.eraseFromParent(); 6265 } 6266 continue; 6267 case AMDGPU::S_UADDO_PSEUDO: 6268 case AMDGPU::S_USUBO_PSEUDO: { 6269 const DebugLoc &DL = Inst.getDebugLoc(); 6270 MachineOperand &Dest0 = Inst.getOperand(0); 6271 MachineOperand &Dest1 = Inst.getOperand(1); 6272 MachineOperand &Src0 = Inst.getOperand(2); 6273 MachineOperand &Src1 = Inst.getOperand(3); 6274 6275 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 6276 ? 
AMDGPU::V_ADD_CO_U32_e64 6277 : AMDGPU::V_SUB_CO_U32_e64; 6278 const TargetRegisterClass *NewRC = 6279 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); 6280 Register DestReg = MRI.createVirtualRegister(NewRC); 6281 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) 6282 .addReg(Dest1.getReg(), RegState::Define) 6283 .add(Src0) 6284 .add(Src1) 6285 .addImm(0); // clamp bit 6286 6287 CreatedBBTmp = legalizeOperands(*NewInstr, MDT); 6288 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6289 CreatedBB = CreatedBBTmp; 6290 6291 MRI.replaceRegWith(Dest0.getReg(), DestReg); 6292 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, 6293 Worklist); 6294 Inst.eraseFromParent(); 6295 } 6296 continue; 6297 6298 case AMDGPU::S_CSELECT_B32: 6299 case AMDGPU::S_CSELECT_B64: 6300 lowerSelect(Worklist, Inst, MDT); 6301 Inst.eraseFromParent(); 6302 continue; 6303 case AMDGPU::S_CMP_EQ_I32: 6304 case AMDGPU::S_CMP_LG_I32: 6305 case AMDGPU::S_CMP_GT_I32: 6306 case AMDGPU::S_CMP_GE_I32: 6307 case AMDGPU::S_CMP_LT_I32: 6308 case AMDGPU::S_CMP_LE_I32: 6309 case AMDGPU::S_CMP_EQ_U32: 6310 case AMDGPU::S_CMP_LG_U32: 6311 case AMDGPU::S_CMP_GT_U32: 6312 case AMDGPU::S_CMP_GE_U32: 6313 case AMDGPU::S_CMP_LT_U32: 6314 case AMDGPU::S_CMP_LE_U32: 6315 case AMDGPU::S_CMP_EQ_U64: 6316 case AMDGPU::S_CMP_LG_U64: { 6317 const MCInstrDesc &NewDesc = get(NewOpcode); 6318 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); 6319 MachineInstr *NewInstr = 6320 BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) 6321 .add(Inst.getOperand(0)) 6322 .add(Inst.getOperand(1)); 6323 legalizeOperands(*NewInstr, MDT); 6324 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); 6325 MachineOperand SCCOp = Inst.getOperand(SCCIdx); 6326 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); 6327 Inst.eraseFromParent(); 6328 } 6329 continue; 6330 } 6331 6332 6333 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 6334 // We cannot move this instruction to the VALU, so we should try to 6335 // legalize its operands instead. 6336 CreatedBBTmp = legalizeOperands(Inst, MDT); 6337 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6338 CreatedBB = CreatedBBTmp; 6339 continue; 6340 } 6341 6342 // Use the new VALU Opcode. 6343 const MCInstrDesc &NewDesc = get(NewOpcode); 6344 Inst.setDesc(NewDesc); 6345 6346 // Remove any references to SCC. Vector instructions can't read from it, and 6347 // We're just about to add the implicit use / defs of VCC, and we don't want 6348 // both. 6349 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 6350 MachineOperand &Op = Inst.getOperand(i); 6351 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 6352 // Only propagate through live-def of SCC. 6353 if (Op.isDef() && !Op.isDead()) 6354 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); 6355 if (Op.isUse()) 6356 addSCCDefsToVALUWorklist(Op, Worklist); 6357 Inst.removeOperand(i); 6358 } 6359 } 6360 6361 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 6362 // We are converting these to a BFE, so we need to add the missing 6363 // operands for the size and offset. 6364 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 6365 Inst.addOperand(MachineOperand::CreateImm(0)); 6366 Inst.addOperand(MachineOperand::CreateImm(Size)); 6367 6368 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 6369 // The VALU version adds the second operand to the result, so insert an 6370 // extra 0 operand. 
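      // Roughly: %dst = S_BCNT1_I32_B32 %src becomes the VALU bit count with a
      // zero addend, i.e. popcount(%src) + 0, so the result is unchanged.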
6371 Inst.addOperand(MachineOperand::CreateImm(0)); 6372 } 6373 6374 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 6375 fixImplicitOperands(Inst); 6376 6377 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 6378 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 6379 // If we need to move this to VGPRs, we need to unpack the second operand 6380 // back into the 2 separate ones for bit offset and width. 6381 assert(OffsetWidthOp.isImm() && 6382 "Scalar BFE is only implemented for constant width and offset"); 6383 uint32_t Imm = OffsetWidthOp.getImm(); 6384 6385 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 6386 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 6387 Inst.removeOperand(2); // Remove old immediate. 6388 Inst.addOperand(MachineOperand::CreateImm(Offset)); 6389 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 6390 } 6391 6392 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 6393 unsigned NewDstReg = AMDGPU::NoRegister; 6394 if (HasDst) { 6395 Register DstReg = Inst.getOperand(0).getReg(); 6396 if (DstReg.isPhysical()) 6397 continue; 6398 6399 // Update the destination register class. 6400 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 6401 if (!NewDstRC) 6402 continue; 6403 6404 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && 6405 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 6406 // Instead of creating a copy where src and dst are the same register 6407 // class, we just replace all uses of dst with src. These kinds of 6408 // copies interfere with the heuristics MachineSink uses to decide 6409 // whether or not to split a critical edge. Since the pass assumes 6410 // that copies will end up as machine instructions and not be 6411 // eliminated. 6412 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 6413 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 6414 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 6415 Inst.getOperand(0).setReg(DstReg); 6416 6417 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally 6418 // these are deleted later, but at -O0 it would leave a suspicious 6419 // looking illegal copy of an undef register. 6420 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 6421 Inst.removeOperand(I); 6422 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 6423 continue; 6424 } 6425 6426 NewDstReg = MRI.createVirtualRegister(NewDstRC); 6427 MRI.replaceRegWith(DstReg, NewDstReg); 6428 } 6429 6430 // Legalize the operands 6431 CreatedBBTmp = legalizeOperands(Inst, MDT); 6432 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6433 CreatedBB = CreatedBBTmp; 6434 6435 if (HasDst) 6436 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 6437 } 6438 return CreatedBB; 6439 } 6440 6441 // Add/sub require special handling to deal with carry outs. 6442 std::pair<bool, MachineBasicBlock *> 6443 SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, 6444 MachineDominatorTree *MDT) const { 6445 if (ST.hasAddNoCarry()) { 6446 // Assume there is no user of scc since we don't select this in that case. 6447 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 6448 // is used. 
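    // Illustrative rewrite (register names hypothetical):
    //   %d:sreg_32 = S_ADD_I32 %a, %b, implicit-def dead $scc
    // becomes
    //   %d':vgpr_32 = V_ADD_U32_e64 %a, %b, 0 /*clamp*/
    // with the scc def dropped and all users of %d repointed at %d'.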
6449 6450 MachineBasicBlock &MBB = *Inst.getParent(); 6451 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6452 6453 Register OldDstReg = Inst.getOperand(0).getReg(); 6454 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6455 6456 unsigned Opc = Inst.getOpcode(); 6457 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 6458 6459 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 6460 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 6461 6462 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 6463 Inst.removeOperand(3); 6464 6465 Inst.setDesc(get(NewOpc)); 6466 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit 6467 Inst.addImplicitDefUseOperands(*MBB.getParent()); 6468 MRI.replaceRegWith(OldDstReg, ResultReg); 6469 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); 6470 6471 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6472 return std::make_pair(true, NewBB); 6473 } 6474 6475 return std::make_pair(false, nullptr); 6476 } 6477 6478 void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, 6479 MachineDominatorTree *MDT) const { 6480 6481 MachineBasicBlock &MBB = *Inst.getParent(); 6482 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6483 MachineBasicBlock::iterator MII = Inst; 6484 DebugLoc DL = Inst.getDebugLoc(); 6485 6486 MachineOperand &Dest = Inst.getOperand(0); 6487 MachineOperand &Src0 = Inst.getOperand(1); 6488 MachineOperand &Src1 = Inst.getOperand(2); 6489 MachineOperand &Cond = Inst.getOperand(3); 6490 6491 Register SCCSource = Cond.getReg(); 6492 bool IsSCC = (SCCSource == AMDGPU::SCC); 6493 6494 // If this is a trivial select where the condition is effectively not SCC 6495 // (SCCSource is a source of copy to SCC), then the select is semantically 6496 // equivalent to copying SCCSource. Hence, there is no need to create 6497 // V_CNDMASK, we can just use that and bail out. 6498 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && 6499 (Src1.getImm() == 0)) { 6500 MRI.replaceRegWith(Dest.getReg(), SCCSource); 6501 return; 6502 } 6503 6504 const TargetRegisterClass *TC = 6505 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6506 6507 Register CopySCC = MRI.createVirtualRegister(TC); 6508 6509 if (IsSCC) { 6510 // Now look for the closest SCC def if it is a copy 6511 // replacing the SCCSource with the COPY source register 6512 bool CopyFound = false; 6513 for (MachineInstr &CandI : 6514 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), 6515 Inst.getParent()->rend())) { 6516 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != 6517 -1) { 6518 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { 6519 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC) 6520 .addReg(CandI.getOperand(1).getReg()); 6521 CopyFound = true; 6522 } 6523 break; 6524 } 6525 } 6526 if (!CopyFound) { 6527 // SCC def is not a copy 6528 // Insert a trivial select instead of creating a copy, because a copy from 6529 // SCC would semantically mean just copying a single bit, but we may need 6530 // the result to be a vector condition mask that needs preserving. 6531 unsigned Opcode = (ST.getWavefrontSize() == 64) ? 
AMDGPU::S_CSELECT_B64 6532 : AMDGPU::S_CSELECT_B32; 6533 auto NewSelect = 6534 BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); 6535 NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); 6536 } 6537 } 6538 6539 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6540 6541 auto UpdatedInst = 6542 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) 6543 .addImm(0) 6544 .add(Src1) // False 6545 .addImm(0) 6546 .add(Src0) // True 6547 .addReg(IsSCC ? CopySCC : SCCSource); 6548 6549 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6550 legalizeOperands(*UpdatedInst, MDT); 6551 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6552 } 6553 6554 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, 6555 MachineInstr &Inst) const { 6556 MachineBasicBlock &MBB = *Inst.getParent(); 6557 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6558 MachineBasicBlock::iterator MII = Inst; 6559 DebugLoc DL = Inst.getDebugLoc(); 6560 6561 MachineOperand &Dest = Inst.getOperand(0); 6562 MachineOperand &Src = Inst.getOperand(1); 6563 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6564 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6565 6566 unsigned SubOp = ST.hasAddNoCarry() ? 6567 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; 6568 6569 BuildMI(MBB, MII, DL, get(SubOp), TmpReg) 6570 .addImm(0) 6571 .addReg(Src.getReg()); 6572 6573 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 6574 .addReg(Src.getReg()) 6575 .addReg(TmpReg); 6576 6577 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6578 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6579 } 6580 6581 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, 6582 MachineInstr &Inst) const { 6583 MachineBasicBlock &MBB = *Inst.getParent(); 6584 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6585 MachineBasicBlock::iterator MII = Inst; 6586 const DebugLoc &DL = Inst.getDebugLoc(); 6587 6588 MachineOperand &Dest = Inst.getOperand(0); 6589 MachineOperand &Src0 = Inst.getOperand(1); 6590 MachineOperand &Src1 = Inst.getOperand(2); 6591 6592 if (ST.hasDLInsts()) { 6593 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6594 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); 6595 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); 6596 6597 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) 6598 .add(Src0) 6599 .add(Src1); 6600 6601 MRI.replaceRegWith(Dest.getReg(), NewDest); 6602 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6603 } else { 6604 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can 6605 // invert either source and then perform the XOR. If either source is a 6606 // scalar register, then we can leave the inversion on the scalar unit to 6607 // achieve a better distribution of scalar and vector instructions. 6608 bool Src0IsSGPR = Src0.isReg() && 6609 RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); 6610 bool Src1IsSGPR = Src1.isReg() && 6611 RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); 6612 MachineInstr *Xor; 6613 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6614 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6615 6616 // Build a pair of scalar instructions and add them to the work list. 6617 // The next iteration over the work list will lower these to the vector 6618 // unit as necessary. 
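    // For example, when src0 is an SGPR (names hypothetical):
    //   %t:sreg_32 = S_NOT_B32 %src0
    //   %d:sreg_32 = S_XOR_B32 %t, %src1
    // If neither source is an SGPR, the NOT is applied to the XOR result
    // instead.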
6619 if (Src0IsSGPR) { 6620 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); 6621 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 6622 .addReg(Temp) 6623 .add(Src1); 6624 } else if (Src1IsSGPR) { 6625 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); 6626 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 6627 .add(Src0) 6628 .addReg(Temp); 6629 } else { 6630 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) 6631 .add(Src0) 6632 .add(Src1); 6633 MachineInstr *Not = 6634 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); 6635 Worklist.insert(Not); 6636 } 6637 6638 MRI.replaceRegWith(Dest.getReg(), NewDest); 6639 6640 Worklist.insert(Xor); 6641 6642 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6643 } 6644 } 6645 6646 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, 6647 MachineInstr &Inst, 6648 unsigned Opcode) const { 6649 MachineBasicBlock &MBB = *Inst.getParent(); 6650 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6651 MachineBasicBlock::iterator MII = Inst; 6652 const DebugLoc &DL = Inst.getDebugLoc(); 6653 6654 MachineOperand &Dest = Inst.getOperand(0); 6655 MachineOperand &Src0 = Inst.getOperand(1); 6656 MachineOperand &Src1 = Inst.getOperand(2); 6657 6658 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6659 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6660 6661 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) 6662 .add(Src0) 6663 .add(Src1); 6664 6665 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 6666 .addReg(Interm); 6667 6668 Worklist.insert(&Op); 6669 Worklist.insert(&Not); 6670 6671 MRI.replaceRegWith(Dest.getReg(), NewDest); 6672 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6673 } 6674 6675 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, 6676 MachineInstr &Inst, 6677 unsigned Opcode) const { 6678 MachineBasicBlock &MBB = *Inst.getParent(); 6679 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6680 MachineBasicBlock::iterator MII = Inst; 6681 const DebugLoc &DL = Inst.getDebugLoc(); 6682 6683 MachineOperand &Dest = Inst.getOperand(0); 6684 MachineOperand &Src0 = Inst.getOperand(1); 6685 MachineOperand &Src1 = Inst.getOperand(2); 6686 6687 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 6688 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 6689 6690 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) 6691 .add(Src1); 6692 6693 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) 6694 .add(Src0) 6695 .addReg(Interm); 6696 6697 Worklist.insert(&Not); 6698 Worklist.insert(&Op); 6699 6700 MRI.replaceRegWith(Dest.getReg(), NewDest); 6701 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6702 } 6703 6704 void SIInstrInfo::splitScalar64BitUnaryOp( 6705 SetVectorType &Worklist, MachineInstr &Inst, 6706 unsigned Opcode, bool Swap) const { 6707 MachineBasicBlock &MBB = *Inst.getParent(); 6708 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6709 6710 MachineOperand &Dest = Inst.getOperand(0); 6711 MachineOperand &Src0 = Inst.getOperand(1); 6712 DebugLoc DL = Inst.getDebugLoc(); 6713 6714 MachineBasicBlock::iterator MII = Inst; 6715 6716 const MCInstrDesc &InstDesc = get(Opcode); 6717 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
6718 MRI.getRegClass(Src0.getReg()) : 6719 &AMDGPU::SGPR_32RegClass; 6720 6721 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 6722 6723 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6724 AMDGPU::sub0, Src0SubRC); 6725 6726 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 6727 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 6728 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 6729 6730 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 6731 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 6732 6733 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6734 AMDGPU::sub1, Src0SubRC); 6735 6736 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 6737 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 6738 6739 if (Swap) 6740 std::swap(DestSub0, DestSub1); 6741 6742 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 6743 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 6744 .addReg(DestSub0) 6745 .addImm(AMDGPU::sub0) 6746 .addReg(DestSub1) 6747 .addImm(AMDGPU::sub1); 6748 6749 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 6750 6751 Worklist.insert(&LoHalf); 6752 Worklist.insert(&HiHalf); 6753 6754 // We don't need to legalizeOperands here because for a single operand, src0 6755 // will support any kind of input. 6756 6757 // Move all users of this moved value. 6758 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 6759 } 6760 6761 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, 6762 MachineInstr &Inst, 6763 MachineDominatorTree *MDT) const { 6764 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 6765 6766 MachineBasicBlock &MBB = *Inst.getParent(); 6767 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6768 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6769 6770 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 6771 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6772 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6773 6774 Register CarryReg = MRI.createVirtualRegister(CarryRC); 6775 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 6776 6777 MachineOperand &Dest = Inst.getOperand(0); 6778 MachineOperand &Src0 = Inst.getOperand(1); 6779 MachineOperand &Src1 = Inst.getOperand(2); 6780 const DebugLoc &DL = Inst.getDebugLoc(); 6781 MachineBasicBlock::iterator MII = Inst; 6782 6783 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); 6784 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); 6785 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 6786 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 6787 6788 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6789 AMDGPU::sub0, Src0SubRC); 6790 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6791 AMDGPU::sub0, Src1SubRC); 6792 6793 6794 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6795 AMDGPU::sub1, Src0SubRC); 6796 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6797 AMDGPU::sub1, Src1SubRC); 6798 6799 unsigned LoOpc = IsAdd ? 
                           AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .add(SrcReg0Sub0)
          .add(SrcReg1Sub0)
          .addImm(0); // clamp bit

  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .add(SrcReg0Sub1)
          .add(SrcReg1Sub1)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
6853 MRI.getRegClass(Src1.getReg()) : 6854 &AMDGPU::SGPR_32RegClass; 6855 6856 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 6857 6858 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6859 AMDGPU::sub0, Src0SubRC); 6860 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6861 AMDGPU::sub0, Src1SubRC); 6862 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6863 AMDGPU::sub1, Src0SubRC); 6864 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6865 AMDGPU::sub1, Src1SubRC); 6866 6867 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 6868 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 6869 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 6870 6871 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 6872 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 6873 .add(SrcReg0Sub0) 6874 .add(SrcReg1Sub0); 6875 6876 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 6877 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 6878 .add(SrcReg0Sub1) 6879 .add(SrcReg1Sub1); 6880 6881 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 6882 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 6883 .addReg(DestSub0) 6884 .addImm(AMDGPU::sub0) 6885 .addReg(DestSub1) 6886 .addImm(AMDGPU::sub1); 6887 6888 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 6889 6890 Worklist.insert(&LoHalf); 6891 Worklist.insert(&HiHalf); 6892 6893 // Move all users of this moved value. 6894 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 6895 } 6896 6897 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, 6898 MachineInstr &Inst, 6899 MachineDominatorTree *MDT) const { 6900 MachineBasicBlock &MBB = *Inst.getParent(); 6901 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6902 6903 MachineOperand &Dest = Inst.getOperand(0); 6904 MachineOperand &Src0 = Inst.getOperand(1); 6905 MachineOperand &Src1 = Inst.getOperand(2); 6906 const DebugLoc &DL = Inst.getDebugLoc(); 6907 6908 MachineBasicBlock::iterator MII = Inst; 6909 6910 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 6911 6912 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 6913 6914 MachineOperand* Op0; 6915 MachineOperand* Op1; 6916 6917 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { 6918 Op0 = &Src0; 6919 Op1 = &Src1; 6920 } else { 6921 Op0 = &Src1; 6922 Op1 = &Src0; 6923 } 6924 6925 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) 6926 .add(*Op0); 6927 6928 Register NewDest = MRI.createVirtualRegister(DestRC); 6929 6930 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) 6931 .addReg(Interm) 6932 .add(*Op1); 6933 6934 MRI.replaceRegWith(Dest.getReg(), NewDest); 6935 6936 Worklist.insert(&Xor); 6937 } 6938 6939 void SIInstrInfo::splitScalar64BitBCNT( 6940 SetVectorType &Worklist, MachineInstr &Inst) const { 6941 MachineBasicBlock &MBB = *Inst.getParent(); 6942 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6943 6944 MachineBasicBlock::iterator MII = Inst; 6945 const DebugLoc &DL = Inst.getDebugLoc(); 6946 6947 MachineOperand &Dest = Inst.getOperand(0); 6948 MachineOperand &Src = Inst.getOperand(1); 6949 6950 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 6951 const TargetRegisterClass *SrcRC = Src.isReg() ? 
6952 MRI.getRegClass(Src.getReg()) : 6953 &AMDGPU::SGPR_32RegClass; 6954 6955 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6956 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6957 6958 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 6959 6960 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 6961 AMDGPU::sub0, SrcSubRC); 6962 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 6963 AMDGPU::sub1, SrcSubRC); 6964 6965 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 6966 6967 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 6968 6969 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6970 6971 // We don't need to legalize operands here. src0 for either instruction can be 6972 // an SGPR, and the second input is unused or determined here. 6973 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6974 } 6975 6976 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, 6977 MachineInstr &Inst) const { 6978 MachineBasicBlock &MBB = *Inst.getParent(); 6979 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6980 MachineBasicBlock::iterator MII = Inst; 6981 const DebugLoc &DL = Inst.getDebugLoc(); 6982 6983 MachineOperand &Dest = Inst.getOperand(0); 6984 uint32_t Imm = Inst.getOperand(2).getImm(); 6985 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 6986 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 6987 6988 (void) Offset; 6989 6990 // Only sext_inreg cases handled. 6991 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 6992 Offset == 0 && "Not implemented"); 6993 6994 if (BitWidth < 32) { 6995 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6996 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6997 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 6998 6999 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) 7000 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 7001 .addImm(0) 7002 .addImm(BitWidth); 7003 7004 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 7005 .addImm(31) 7006 .addReg(MidRegLo); 7007 7008 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 7009 .addReg(MidRegLo) 7010 .addImm(AMDGPU::sub0) 7011 .addReg(MidRegHi) 7012 .addImm(AMDGPU::sub1); 7013 7014 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7015 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7016 return; 7017 } 7018 7019 MachineOperand &Src = Inst.getOperand(1); 7020 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7021 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 7022 7023 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 7024 .addImm(31) 7025 .addReg(Src.getReg(), 0, AMDGPU::sub0); 7026 7027 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 7028 .addReg(Src.getReg(), 0, AMDGPU::sub0) 7029 .addImm(AMDGPU::sub0) 7030 .addReg(TmpReg) 7031 .addImm(AMDGPU::sub1); 7032 7033 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7034 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7035 } 7036 7037 void SIInstrInfo::addUsersToMoveToVALUWorklist( 7038 Register DstReg, 7039 MachineRegisterInfo &MRI, 7040 SetVectorType &Worklist) const { 7041 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 7042 E = MRI.use_end(); I != E;) { 7043 MachineInstr &UseMI = *I->getParent(); 7044 7045 unsigned OpNo = 
0; 7046 7047 switch (UseMI.getOpcode()) { 7048 case AMDGPU::COPY: 7049 case AMDGPU::WQM: 7050 case AMDGPU::SOFT_WQM: 7051 case AMDGPU::STRICT_WWM: 7052 case AMDGPU::STRICT_WQM: 7053 case AMDGPU::REG_SEQUENCE: 7054 case AMDGPU::PHI: 7055 case AMDGPU::INSERT_SUBREG: 7056 break; 7057 default: 7058 OpNo = I.getOperandNo(); 7059 break; 7060 } 7061 7062 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { 7063 Worklist.insert(&UseMI); 7064 7065 do { 7066 ++I; 7067 } while (I != E && I->getParent() == &UseMI); 7068 } else { 7069 ++I; 7070 } 7071 } 7072 } 7073 7074 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, 7075 MachineRegisterInfo &MRI, 7076 MachineInstr &Inst) const { 7077 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7078 MachineBasicBlock *MBB = Inst.getParent(); 7079 MachineOperand &Src0 = Inst.getOperand(1); 7080 MachineOperand &Src1 = Inst.getOperand(2); 7081 const DebugLoc &DL = Inst.getDebugLoc(); 7082 7083 switch (Inst.getOpcode()) { 7084 case AMDGPU::S_PACK_LL_B32_B16: { 7085 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7086 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7087 7088 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 7089 // 0. 7090 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 7091 .addImm(0xffff); 7092 7093 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 7094 .addReg(ImmReg, RegState::Kill) 7095 .add(Src0); 7096 7097 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) 7098 .add(Src1) 7099 .addImm(16) 7100 .addReg(TmpReg, RegState::Kill); 7101 break; 7102 } 7103 case AMDGPU::S_PACK_LH_B32_B16: { 7104 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7105 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 7106 .addImm(0xffff); 7107 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) 7108 .addReg(ImmReg, RegState::Kill) 7109 .add(Src0) 7110 .add(Src1); 7111 break; 7112 } 7113 case AMDGPU::S_PACK_HH_B32_B16: { 7114 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7115 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7116 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 7117 .addImm(16) 7118 .add(Src0); 7119 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 7120 .addImm(0xffff0000); 7121 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) 7122 .add(Src1) 7123 .addReg(ImmReg, RegState::Kill) 7124 .addReg(TmpReg, RegState::Kill); 7125 break; 7126 } 7127 default: 7128 llvm_unreachable("unhandled s_pack_* instruction"); 7129 } 7130 7131 MachineOperand &Dest = Inst.getOperand(0); 7132 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7133 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7134 } 7135 7136 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, 7137 MachineInstr &SCCDefInst, 7138 SetVectorType &Worklist, 7139 Register NewCond) const { 7140 7141 // Ensure that def inst defines SCC, which is still live. 7142 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && 7143 !Op.isDead() && Op.getParent() == &SCCDefInst); 7144 SmallVector<MachineInstr *, 4> CopyToDelete; 7145 // This assumes that all the users of SCC are in the same block 7146 // as the SCC def. 7147 for (MachineInstr &MI : // Skip the def inst itself. 7148 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), 7149 SCCDefInst.getParent()->end())) { 7150 // Check if SCC is used first. 
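    // (Illustrative summary of the handling below: a plain COPY of SCC is
    // rewritten to read NewCond and deleted afterwards, while any other SCC
    // user is rewired to NewCond, when valid, and queued for VALU lowering.)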
7151 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI); 7152 if (SCCIdx != -1) { 7153 if (MI.isCopy()) { 7154 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 7155 Register DestReg = MI.getOperand(0).getReg(); 7156 7157 MRI.replaceRegWith(DestReg, NewCond); 7158 CopyToDelete.push_back(&MI); 7159 } else { 7160 7161 if (NewCond.isValid()) 7162 MI.getOperand(SCCIdx).setReg(NewCond); 7163 7164 Worklist.insert(&MI); 7165 } 7166 } 7167 // Exit if we find another SCC def. 7168 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 7169 break; 7170 } 7171 for (auto &Copy : CopyToDelete) 7172 Copy->eraseFromParent(); 7173 } 7174 7175 // Instructions that use SCC may be converted to VALU instructions. When that 7176 // happens, the SCC register is changed to VCC_LO. The instruction that defines 7177 // SCC must be changed to an instruction that defines VCC. This function makes 7178 // sure that the instruction that defines SCC is added to the moveToVALU 7179 // worklist. 7180 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, 7181 SetVectorType &Worklist) const { 7182 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); 7183 7184 MachineInstr *SCCUseInst = Op.getParent(); 7185 // Look for a preceding instruction that either defines VCC or SCC. If VCC 7186 // then there is nothing to do because the defining instruction has been 7187 // converted to a VALU already. If SCC then that instruction needs to be 7188 // converted to a VALU. 7189 for (MachineInstr &MI : 7190 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)), 7191 SCCUseInst->getParent()->rend())) { 7192 if (MI.modifiesRegister(AMDGPU::VCC, &RI)) 7193 break; 7194 if (MI.definesRegister(AMDGPU::SCC, &RI)) { 7195 Worklist.insert(&MI); 7196 break; 7197 } 7198 } 7199 } 7200 7201 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 7202 const MachineInstr &Inst) const { 7203 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 7204 7205 switch (Inst.getOpcode()) { 7206 // For target instructions, getOpRegClass just returns the virtual register 7207 // class associated with the operand, so we need to find an equivalent VGPR 7208 // register class in order to move the instruction to the VALU. 7209 case AMDGPU::COPY: 7210 case AMDGPU::PHI: 7211 case AMDGPU::REG_SEQUENCE: 7212 case AMDGPU::INSERT_SUBREG: 7213 case AMDGPU::WQM: 7214 case AMDGPU::SOFT_WQM: 7215 case AMDGPU::STRICT_WWM: 7216 case AMDGPU::STRICT_WQM: { 7217 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 7218 if (RI.isAGPRClass(SrcRC)) { 7219 if (RI.isAGPRClass(NewDstRC)) 7220 return nullptr; 7221 7222 switch (Inst.getOpcode()) { 7223 case AMDGPU::PHI: 7224 case AMDGPU::REG_SEQUENCE: 7225 case AMDGPU::INSERT_SUBREG: 7226 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 7227 break; 7228 default: 7229 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 7230 } 7231 7232 if (!NewDstRC) 7233 return nullptr; 7234 } else { 7235 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 7236 return nullptr; 7237 7238 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 7239 if (!NewDstRC) 7240 return nullptr; 7241 } 7242 7243 return NewDstRC; 7244 } 7245 default: 7246 return NewDstRC; 7247 } 7248 } 7249 7250 // Find the one SGPR operand we are allowed to use. 7251 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 7252 int OpIndices[3]) const { 7253 const MCInstrDesc &Desc = MI.getDesc(); 7254 7255 // Find the one SGPR operand we are allowed to use. 
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  Register SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  Register UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    Register Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60); // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // Note that it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size

  // GFX9 doesn't have ELEMENT_SIZE.
7357 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 7358 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; 7359 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; 7360 } 7361 7362 // IndexStride = 64 / 32. 7363 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; 7364 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; 7365 7366 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 7367 // Clear them unless we want a huge stride. 7368 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 7369 ST.getGeneration() <= AMDGPUSubtarget::GFX9) 7370 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 7371 7372 return Rsrc23; 7373 } 7374 7375 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 7376 unsigned Opc = MI.getOpcode(); 7377 7378 return isSMRD(Opc); 7379 } 7380 7381 bool SIInstrInfo::isHighLatencyDef(int Opc) const { 7382 return get(Opc).mayLoad() && 7383 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); 7384 } 7385 7386 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 7387 int &FrameIndex) const { 7388 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 7389 if (!Addr || !Addr->isFI()) 7390 return AMDGPU::NoRegister; 7391 7392 assert(!MI.memoperands_empty() && 7393 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 7394 7395 FrameIndex = Addr->getIndex(); 7396 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 7397 } 7398 7399 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 7400 int &FrameIndex) const { 7401 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 7402 assert(Addr && Addr->isFI()); 7403 FrameIndex = Addr->getIndex(); 7404 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 7405 } 7406 7407 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 7408 int &FrameIndex) const { 7409 if (!MI.mayLoad()) 7410 return AMDGPU::NoRegister; 7411 7412 if (isMUBUF(MI) || isVGPRSpill(MI)) 7413 return isStackAccess(MI, FrameIndex); 7414 7415 if (isSGPRSpill(MI)) 7416 return isSGPRStackAccess(MI, FrameIndex); 7417 7418 return AMDGPU::NoRegister; 7419 } 7420 7421 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 7422 int &FrameIndex) const { 7423 if (!MI.mayStore()) 7424 return AMDGPU::NoRegister; 7425 7426 if (isMUBUF(MI) || isVGPRSpill(MI)) 7427 return isStackAccess(MI, FrameIndex); 7428 7429 if (isSGPRSpill(MI)) 7430 return isSGPRStackAccess(MI, FrameIndex); 7431 7432 return AMDGPU::NoRegister; 7433 } 7434 7435 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 7436 unsigned Size = 0; 7437 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 7438 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 7439 while (++I != E && I->isInsideBundle()) { 7440 assert(!I->isBundle() && "No nested bundle!"); 7441 Size += getInstSizeInBytes(*I); 7442 } 7443 7444 return Size; 7445 } 7446 7447 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 7448 unsigned Opc = MI.getOpcode(); 7449 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 7450 unsigned DescSize = Desc.getSize(); 7451 7452 // If we have a definitive size, we can use it. Otherwise we need to inspect 7453 // the operands to know the size. 7454 if (isFixedSize(MI)) { 7455 unsigned Size = DescSize; 7456 7457 // If we hit the buggy offset, an extra nop will be inserted in MC so 7458 // estimate the worst case. 
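    // For example, a fixed-size 4-byte branch on a subtarget with the
    // offset-0x3f hardware bug is reported as 8 bytes here to leave room for
    // the workaround nop emitted by the MC layer.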
7459 if (MI.isBranch() && ST.hasOffset3fBug()) 7460 Size += 4; 7461 7462 return Size; 7463 } 7464 7465 // Instructions may have a 32-bit literal encoded after them. Check 7466 // operands that could ever be literals. 7467 if (isVALU(MI) || isSALU(MI)) { 7468 if (isDPP(MI)) 7469 return DescSize; 7470 bool HasLiteral = false; 7471 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { 7472 const MachineOperand &Op = MI.getOperand(I); 7473 const MCOperandInfo &OpInfo = Desc.OpInfo[I]; 7474 if (isLiteralConstantLike(Op, OpInfo)) { 7475 HasLiteral = true; 7476 break; 7477 } 7478 } 7479 return HasLiteral ? DescSize + 4 : DescSize; 7480 } 7481 7482 // Check whether we have extra NSA words. 7483 if (isMIMG(MI)) { 7484 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 7485 if (VAddr0Idx < 0) 7486 return 8; 7487 7488 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 7489 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); 7490 } 7491 7492 switch (Opc) { 7493 case TargetOpcode::BUNDLE: 7494 return getInstBundleSize(MI); 7495 case TargetOpcode::INLINEASM: 7496 case TargetOpcode::INLINEASM_BR: { 7497 const MachineFunction *MF = MI.getParent()->getParent(); 7498 const char *AsmStr = MI.getOperand(0).getSymbolName(); 7499 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); 7500 } 7501 default: 7502 if (MI.isMetaInstruction()) 7503 return 0; 7504 return DescSize; 7505 } 7506 } 7507 7508 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 7509 if (!isFLAT(MI)) 7510 return false; 7511 7512 if (MI.memoperands_empty()) 7513 return true; 7514 7515 for (const MachineMemOperand *MMO : MI.memoperands()) { 7516 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 7517 return true; 7518 } 7519 return false; 7520 } 7521 7522 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 7523 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 7524 } 7525 7526 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 7527 MachineBasicBlock *IfEnd) const { 7528 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 7529 assert(TI != IfEntry->end()); 7530 7531 MachineInstr *Branch = &(*TI); 7532 MachineFunction *MF = IfEntry->getParent(); 7533 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 7534 7535 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 7536 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 7537 MachineInstr *SIIF = 7538 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 7539 .add(Branch->getOperand(0)) 7540 .add(Branch->getOperand(1)); 7541 MachineInstr *SIEND = 7542 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 7543 .addReg(DstReg); 7544 7545 IfEntry->erase(TI); 7546 IfEntry->insert(IfEntry->end(), SIIF); 7547 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 7548 } 7549 } 7550 7551 void SIInstrInfo::convertNonUniformLoopRegion( 7552 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 7553 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 7554 // We expect 2 terminators, one conditional and one unconditional. 
7555 assert(TI != LoopEnd->end()); 7556 7557 MachineInstr *Branch = &(*TI); 7558 MachineFunction *MF = LoopEnd->getParent(); 7559 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 7560 7561 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 7562 7563 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 7564 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); 7565 MachineInstrBuilder HeaderPHIBuilder = 7566 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 7567 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) { 7568 if (PMBB == LoopEnd) { 7569 HeaderPHIBuilder.addReg(BackEdgeReg); 7570 } else { 7571 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); 7572 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 7573 ZeroReg, 0); 7574 HeaderPHIBuilder.addReg(ZeroReg); 7575 } 7576 HeaderPHIBuilder.addMBB(PMBB); 7577 } 7578 MachineInstr *HeaderPhi = HeaderPHIBuilder; 7579 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 7580 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 7581 .addReg(DstReg) 7582 .add(Branch->getOperand(0)); 7583 MachineInstr *SILOOP = 7584 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 7585 .addReg(BackEdgeReg) 7586 .addMBB(LoopEntry); 7587 7588 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 7589 LoopEnd->erase(TI); 7590 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 7591 LoopEnd->insert(LoopEnd->end(), SILOOP); 7592 } 7593 } 7594 7595 ArrayRef<std::pair<int, const char *>> 7596 SIInstrInfo::getSerializableTargetIndices() const { 7597 static const std::pair<int, const char *> TargetIndices[] = { 7598 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 7599 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 7600 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 7601 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 7602 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 7603 return makeArrayRef(TargetIndices); 7604 } 7605 7606 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 7607 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 7608 ScheduleHazardRecognizer * 7609 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 7610 const ScheduleDAG *DAG) const { 7611 return new GCNHazardRecognizer(DAG->MF); 7612 } 7613 7614 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 7615 /// pass. 7616 ScheduleHazardRecognizer * 7617 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 7618 return new GCNHazardRecognizer(MF); 7619 } 7620 7621 // Called during: 7622 // - pre-RA scheduling and post-RA scheduling 7623 ScheduleHazardRecognizer * 7624 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II, 7625 const ScheduleDAGMI *DAG) const { 7626 // Borrowed from Arm Target 7627 // We would like to restrict this hazard recognizer to only 7628 // post-RA scheduling; we can tell that we're post-RA because we don't 7629 // track VRegLiveness. 
7630 if (!DAG->hasVRegLiveness()) 7631 return new GCNHazardRecognizer(DAG->MF); 7632 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); 7633 } 7634 7635 std::pair<unsigned, unsigned> 7636 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 7637 return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); 7638 } 7639 7640 ArrayRef<std::pair<unsigned, const char *>> 7641 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 7642 static const std::pair<unsigned, const char *> TargetFlags[] = { 7643 { MO_GOTPCREL, "amdgpu-gotprel" }, 7644 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 7645 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 7646 { MO_REL32_LO, "amdgpu-rel32-lo" }, 7647 { MO_REL32_HI, "amdgpu-rel32-hi" }, 7648 { MO_ABS32_LO, "amdgpu-abs32-lo" }, 7649 { MO_ABS32_HI, "amdgpu-abs32-hi" }, 7650 }; 7651 7652 return makeArrayRef(TargetFlags); 7653 } 7654 7655 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 7656 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { 7657 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 7658 { 7659 {MONoClobber, "amdgpu-noclobber"}, 7660 }; 7661 7662 return makeArrayRef(TargetFlags); 7663 } 7664 7665 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 7666 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 7667 MI.modifiesRegister(AMDGPU::EXEC, &RI); 7668 } 7669 7670 MachineInstrBuilder 7671 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 7672 MachineBasicBlock::iterator I, 7673 const DebugLoc &DL, 7674 Register DestReg) const { 7675 if (ST.hasAddNoCarry()) 7676 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 7677 7678 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7679 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); 7680 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); 7681 7682 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 7683 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 7684 } 7685 7686 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 7687 MachineBasicBlock::iterator I, 7688 const DebugLoc &DL, 7689 Register DestReg, 7690 RegScavenger &RS) const { 7691 if (ST.hasAddNoCarry()) 7692 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); 7693 7694 // If available, prefer to use vcc. 7695 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) 7696 ? Register(RI.getVCC()) 7697 : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); 7698 7699 // TODO: Users need to deal with this. 
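  // A caller might guard against scavenging failure along these lines
  // (illustrative sketch only):
  //   MachineInstrBuilder Add = TII->getAddNoCarry(MBB, I, DL, DestReg, *RS);
  //   if (!Add.getInstr())
  //     return; // No carry register could be scavenged; bail out or fall back.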
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
         .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}

void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
  if (!ST.isWave32())
    return;

  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}

bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}

// Depending on the used address space and instructions, some immediate offsets
// are allowed and some are not.
// In general, flat instruction offsets can only be non-negative; global and
// scratch instruction offsets can also be negative.
//
// There are several bugs related to these offsets:
// On gfx10.1, flat instructions that go into the global address space cannot
// use an offset.
//
// For scratch instructions, the address can be either an SGPR or a VGPR.
// The following offsets can be used, depending on the architecture (x means
// cannot be used):
// +----------------------------+------+------+
// | Address-Mode               | SGPR | VGPR |
// +----------------------------+------+------+
// | gfx9                       |      |      |
// | negative, 4-aligned offset | x    | ok   |
// | negative, unaligned offset | x    | ok   |
// +----------------------------+------+------+
// | gfx10                      |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | x    |
// +----------------------------+------+------+
// | gfx10.3                    |      |      |
// | negative, 4-aligned offset | ok   | ok   |
// | negative, unaligned offset | ok   | ok   |
// +----------------------------+------+------+
//
// This function ignores the addressing mode, so if an offset cannot be used in
// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    uint64_t FlatVariant) const {
  // TODO: Should 0 be special cased?
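  // Illustrative example (actual bit widths depend on the subtarget): with N
  // offset bits, the unsigned FLAT variant accepts [0, 2^N) while the signed
  // GLOBAL/SCRATCH variants accept [-2^(N-1), 2^(N-1)), subject to the
  // workarounds checked below.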
7784 if (!ST.hasFlatInstOffsets()) 7785 return false; 7786 7787 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && 7788 (AddrSpace == AMDGPUAS::FLAT_ADDRESS || 7789 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) 7790 return false; 7791 7792 bool Signed = FlatVariant != SIInstrFlags::FLAT; 7793 if (ST.hasNegativeScratchOffsetBug() && 7794 FlatVariant == SIInstrFlags::FlatScratch) 7795 Signed = false; 7796 if (ST.hasNegativeUnalignedScratchOffsetBug() && 7797 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && 7798 (Offset % 4) != 0) { 7799 return false; 7800 } 7801 7802 unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); 7803 return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); 7804 } 7805 7806 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. 7807 std::pair<int64_t, int64_t> 7808 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, 7809 uint64_t FlatVariant) const { 7810 int64_t RemainderOffset = COffsetVal; 7811 int64_t ImmField = 0; 7812 bool Signed = FlatVariant != SIInstrFlags::FLAT; 7813 if (ST.hasNegativeScratchOffsetBug() && 7814 FlatVariant == SIInstrFlags::FlatScratch) 7815 Signed = false; 7816 7817 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed); 7818 if (Signed) { 7819 // Use signed division by a power of two to truncate towards 0. 7820 int64_t D = 1LL << (NumBits - 1); 7821 RemainderOffset = (COffsetVal / D) * D; 7822 ImmField = COffsetVal - RemainderOffset; 7823 7824 if (ST.hasNegativeUnalignedScratchOffsetBug() && 7825 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && 7826 (ImmField % 4) != 0) { 7827 // Make ImmField a multiple of 4 7828 RemainderOffset += ImmField % 4; 7829 ImmField -= ImmField % 4; 7830 } 7831 } else if (COffsetVal >= 0) { 7832 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); 7833 RemainderOffset = COffsetVal - ImmField; 7834 } 7835 7836 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)); 7837 assert(RemainderOffset + ImmField == COffsetVal); 7838 return {ImmField, RemainderOffset}; 7839 } 7840 7841 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td 7842 // and the columns of the getMCOpcodeGen table. 7843 enum SIEncodingFamily { 7844 SI = 0, 7845 VI = 1, 7846 SDWA = 2, 7847 SDWA9 = 3, 7848 GFX80 = 4, 7849 GFX9 = 5, 7850 GFX10 = 6, 7851 SDWA10 = 7, 7852 GFX90A = 8, 7853 GFX940 = 9, 7854 GFX11 = 10, 7855 }; 7856 7857 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { 7858 switch (ST.getGeneration()) { 7859 default: 7860 break; 7861 case AMDGPUSubtarget::SOUTHERN_ISLANDS: 7862 case AMDGPUSubtarget::SEA_ISLANDS: 7863 return SIEncodingFamily::SI; 7864 case AMDGPUSubtarget::VOLCANIC_ISLANDS: 7865 case AMDGPUSubtarget::GFX9: 7866 return SIEncodingFamily::VI; 7867 case AMDGPUSubtarget::GFX10: 7868 return SIEncodingFamily::GFX10; 7869 case AMDGPUSubtarget::GFX11: 7870 return SIEncodingFamily::GFX11; 7871 } 7872 llvm_unreachable("Unknown subtarget generation!"); 7873 } 7874 7875 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { 7876 switch(MCOp) { 7877 // These opcodes use indirect register addressing so 7878 // they need special handling by codegen (currently missing). 7879 // Therefore it is too risky to allow these opcodes 7880 // to be selected by dpp combiner or sdwa peepholer. 
7881 case AMDGPU::V_MOVRELS_B32_dpp_gfx10: 7882 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: 7883 case AMDGPU::V_MOVRELD_B32_dpp_gfx10: 7884 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: 7885 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: 7886 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: 7887 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: 7888 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: 7889 return true; 7890 default: 7891 return false; 7892 } 7893 } 7894 7895 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { 7896 SIEncodingFamily Gen = subtargetEncodingFamily(ST); 7897 7898 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && 7899 ST.getGeneration() == AMDGPUSubtarget::GFX9) 7900 Gen = SIEncodingFamily::GFX9; 7901 7902 // Adjust the encoding family to GFX80 for D16 buffer instructions when the 7903 // subtarget has UnpackedD16VMem feature. 7904 // TODO: remove this when we discard GFX80 encoding. 7905 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) 7906 Gen = SIEncodingFamily::GFX80; 7907 7908 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { 7909 switch (ST.getGeneration()) { 7910 default: 7911 Gen = SIEncodingFamily::SDWA; 7912 break; 7913 case AMDGPUSubtarget::GFX9: 7914 Gen = SIEncodingFamily::SDWA9; 7915 break; 7916 case AMDGPUSubtarget::GFX10: 7917 Gen = SIEncodingFamily::SDWA10; 7918 break; 7919 } 7920 } 7921 7922 if (isMAI(Opcode)) { 7923 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); 7924 if (MFMAOp != -1) 7925 Opcode = MFMAOp; 7926 } 7927 7928 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); 7929 7930 // -1 means that Opcode is already a native instruction. 7931 if (MCOp == -1) 7932 return Opcode; 7933 7934 if (ST.hasGFX90AInsts()) { 7935 uint16_t NMCOp = (uint16_t)-1; 7936 if (ST.hasGFX940Insts()) 7937 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); 7938 if (NMCOp == (uint16_t)-1) 7939 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); 7940 if (NMCOp == (uint16_t)-1) 7941 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); 7942 if (NMCOp != (uint16_t)-1) 7943 MCOp = NMCOp; 7944 } 7945 7946 // (uint16_t)-1 means that Opcode is a pseudo instruction that has 7947 // no encoding in the given subtarget generation. 7948 if (MCOp == (uint16_t)-1) 7949 return -1; 7950 7951 if (isAsmOnlyOpcode(MCOp)) 7952 return -1; 7953 7954 return MCOp; 7955 } 7956 7957 static 7958 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { 7959 assert(RegOpnd.isReg()); 7960 return RegOpnd.isUndef() ? 
             TargetInstrInfo::RegSubRegPair() :
             getRegSubRegPair(RegOpnd);
}

TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1) / 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}

// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
// Following a subreg of reg:subreg isn't supported.
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg.
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // Inserted the subreg we're looking for.
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // The subreg is in the rest of the reg.
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // A subreg of a subreg isn't supported.
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}

MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!P.Reg.isVirtual())
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Op1.getReg().isVirtual()) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
8055 auto E = UseMI.getIterator(); 8056 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { 8057 if (I->isDebugInstr()) 8058 continue; 8059 8060 if (++NumInst > MaxInstScan) 8061 return true; 8062 8063 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 8064 return true; 8065 } 8066 8067 return false; 8068 } 8069 8070 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, 8071 Register VReg, 8072 const MachineInstr &DefMI) { 8073 assert(MRI.isSSA() && "Must be run on SSA"); 8074 8075 auto *TRI = MRI.getTargetRegisterInfo(); 8076 auto *DefBB = DefMI.getParent(); 8077 8078 const int MaxUseScan = 10; 8079 int NumUse = 0; 8080 8081 for (auto &Use : MRI.use_nodbg_operands(VReg)) { 8082 auto &UseInst = *Use.getParent(); 8083 // Don't bother searching between blocks, although it is possible this block 8084 // doesn't modify exec. 8085 if (UseInst.getParent() != DefBB || UseInst.isPHI()) 8086 return true; 8087 8088 if (++NumUse > MaxUseScan) 8089 return true; 8090 } 8091 8092 if (NumUse == 0) 8093 return false; 8094 8095 const int MaxInstScan = 20; 8096 int NumInst = 0; 8097 8098 // Stop scan when we have seen all the uses. 8099 for (auto I = std::next(DefMI.getIterator()); ; ++I) { 8100 assert(I != DefBB->end()); 8101 8102 if (I->isDebugInstr()) 8103 continue; 8104 8105 if (++NumInst > MaxInstScan) 8106 return true; 8107 8108 for (const MachineOperand &Op : I->operands()) { 8109 // We don't check reg masks here as they're used only on calls: 8110 // 1. EXEC is only considered const within one BB 8111 // 2. Call should be a terminator instruction if present in a BB 8112 8113 if (!Op.isReg()) 8114 continue; 8115 8116 Register Reg = Op.getReg(); 8117 if (Op.isUse()) { 8118 if (Reg == VReg && --NumUse == 0) 8119 return false; 8120 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) 8121 return true; 8122 } 8123 } 8124 } 8125 8126 MachineInstr *SIInstrInfo::createPHIDestinationCopy( 8127 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, 8128 const DebugLoc &DL, Register Src, Register Dst) const { 8129 auto Cur = MBB.begin(); 8130 if (Cur != MBB.end()) 8131 do { 8132 if (!Cur->isPHI() && Cur->readsRegister(Dst)) 8133 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); 8134 ++Cur; 8135 } while (Cur != MBB.end() && Cur != LastPHIIt); 8136 8137 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, 8138 Dst); 8139 } 8140 8141 MachineInstr *SIInstrInfo::createPHISourceCopy( 8142 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, 8143 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { 8144 if (InsPt != MBB.end() && 8145 (InsPt->getOpcode() == AMDGPU::SI_IF || 8146 InsPt->getOpcode() == AMDGPU::SI_ELSE || 8147 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && 8148 InsPt->definesRegister(Src)) { 8149 InsPt++; 8150 return BuildMI(MBB, InsPt, DL, 8151 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term 8152 : AMDGPU::S_MOV_B64_term), 8153 Dst) 8154 .addReg(Src, 0, SrcSubReg) 8155 .addReg(AMDGPU::EXEC, RegState::Implicit); 8156 } 8157 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, 8158 Dst); 8159 } 8160 8161 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } 8162 8163 MachineInstr *SIInstrInfo::foldMemoryOperandImpl( 8164 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 8165 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, 8166 VirtRegMap *VRM) const { 8167 // This is a bit of a hack (copied from AArch64). 
Consider this instruction: 8168 // 8169 // %0:sreg_32 = COPY $m0 8170 // 8171 // We explicitly chose SReg_32 for the virtual register so such a copy might 8172 // be eliminated by RegisterCoalescer. However, that may not be possible, and 8173 // %0 may even spill. We can't spill $m0 normally (it would require copying to 8174 // a numbered SGPR anyway), and since it is in the SReg_32 register class, 8175 // TargetInstrInfo::foldMemoryOperand() is going to try. 8176 // A similar issue also exists with spilling and reloading $exec registers. 8177 // 8178 // To prevent that, constrain the %0 register class here. 8179 if (MI.isFullCopy()) { 8180 Register DstReg = MI.getOperand(0).getReg(); 8181 Register SrcReg = MI.getOperand(1).getReg(); 8182 if ((DstReg.isVirtual() || SrcReg.isVirtual()) && 8183 (DstReg.isVirtual() != SrcReg.isVirtual())) { 8184 MachineRegisterInfo &MRI = MF.getRegInfo(); 8185 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; 8186 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); 8187 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { 8188 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 8189 return nullptr; 8190 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { 8191 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); 8192 return nullptr; 8193 } 8194 } 8195 } 8196 8197 return nullptr; 8198 } 8199 8200 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, 8201 const MachineInstr &MI, 8202 unsigned *PredCost) const { 8203 if (MI.isBundle()) { 8204 MachineBasicBlock::const_instr_iterator I(MI.getIterator()); 8205 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); 8206 unsigned Lat = 0, Count = 0; 8207 for (++I; I != E && I->isBundledWithPred(); ++I) { 8208 ++Count; 8209 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); 8210 } 8211 return Lat + Count - 1; 8212 } 8213 8214 return SchedModel.computeInstrLatency(&MI); 8215 } 8216 8217 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) { 8218 switch (MF.getFunction().getCallingConv()) { 8219 case CallingConv::AMDGPU_PS: 8220 return 1; 8221 case CallingConv::AMDGPU_VS: 8222 return 2; 8223 case CallingConv::AMDGPU_GS: 8224 return 3; 8225 case CallingConv::AMDGPU_HS: 8226 case CallingConv::AMDGPU_LS: 8227 case CallingConv::AMDGPU_ES: 8228 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 8229 case CallingConv::AMDGPU_CS: 8230 case CallingConv::AMDGPU_KERNEL: 8231 case CallingConv::C: 8232 case CallingConv::Fast: 8233 default: 8234 // Assume other calling conventions are various compute callable functions 8235 return 0; 8236 } 8237 } 8238 8239 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 8240 Register &SrcReg2, int64_t &CmpMask, 8241 int64_t &CmpValue) const { 8242 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg()) 8243 return false; 8244 8245 switch (MI.getOpcode()) { 8246 default: 8247 break; 8248 case AMDGPU::S_CMP_EQ_U32: 8249 case AMDGPU::S_CMP_EQ_I32: 8250 case AMDGPU::S_CMP_LG_U32: 8251 case AMDGPU::S_CMP_LG_I32: 8252 case AMDGPU::S_CMP_LT_U32: 8253 case AMDGPU::S_CMP_LT_I32: 8254 case AMDGPU::S_CMP_GT_U32: 8255 case AMDGPU::S_CMP_GT_I32: 8256 case AMDGPU::S_CMP_LE_U32: 8257 case AMDGPU::S_CMP_LE_I32: 8258 case AMDGPU::S_CMP_GE_U32: 8259 case AMDGPU::S_CMP_GE_I32: 8260 case AMDGPU::S_CMP_EQ_U64: 8261 case AMDGPU::S_CMP_LG_U64: 8262 SrcReg = MI.getOperand(0).getReg(); 8263 if (MI.getOperand(1).isReg()) { 8264 if (MI.getOperand(1).getSubReg()) 
bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                 Register &SrcReg2, int64_t &CmpMask,
                                 int64_t &CmpValue) const {
  if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
    SrcReg = MI.getOperand(0).getReg();
    if (MI.getOperand(1).isReg()) {
      if (MI.getOperand(1).getSubReg())
        return false;
      SrcReg2 = MI.getOperand(1).getReg();
      CmpValue = 0;
    } else if (MI.getOperand(1).isImm()) {
      SrcReg2 = Register();
      CmpValue = MI.getOperand(1).getImm();
    } else {
      return false;
    }
    CmpMask = ~0;
    return true;
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
  case AMDGPU::S_CMPK_LT_U32:
  case AMDGPU::S_CMPK_LT_I32:
  case AMDGPU::S_CMPK_GT_U32:
  case AMDGPU::S_CMPK_GT_I32:
  case AMDGPU::S_CMPK_LE_U32:
  case AMDGPU::S_CMPK_LE_I32:
  case AMDGPU::S_CMPK_GE_U32:
  case AMDGPU::S_CMPK_GE_I32:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = Register();
    CmpValue = MI.getOperand(1).getImm();
    CmpMask = ~0;
    return true;
  }

  return false;
}
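// optimizeCompareInstr folds an S_CMP/S_CMPK against the result of an S_AND
// with a single-bit mask into the SCC def of the AND itself; if the AND
// result has no other use, the AND is further rewritten into
// S_BITCMP0/S_BITCMP1 (see the tables in optimizeCmpAnd below). A purely
// illustrative MIR sketch, not taken from an existing test:
//
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def dead $scc
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
//
// becomes, when %1 has no other uses:
//
//   S_BITCMP1_B32 %0, 2, implicit-def $scc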
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                       Register SrcReg2, int64_t CmpMask,
                                       int64_t CmpValue,
                                       const MachineRegisterInfo *MRI) const {
  if (!SrcReg || SrcReg.isPhysical())
    return false;

  if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
    return false;

  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                               this](int64_t ExpectedValue, unsigned SrcSize,
                                     bool IsReversible, bool IsSigned) -> bool {
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
    //
    // Signed ge/gt are not used for the sign bit.
    //
    // If the result of the AND is unused except in the compare:
    // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
    //
    // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
    // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
    // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
    // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n

    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
    if (!Def || Def->getParent() != CmpInstr.getParent())
      return false;

    if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
        Def->getOpcode() != AMDGPU::S_AND_B64)
      return false;

    int64_t Mask;
    const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
      if (MO->isImm())
        Mask = MO->getImm();
      else if (!getFoldableImm(MO, Mask))
        return false;
      Mask &= maxUIntN(SrcSize);
      return isPowerOf2_64(Mask);
    };

    MachineOperand *SrcOp = &Def->getOperand(1);
    if (isMask(SrcOp))
      SrcOp = &Def->getOperand(2);
    else if (isMask(&Def->getOperand(2)))
      SrcOp = &Def->getOperand(1);
    else
      return false;

    unsigned BitNo = countTrailingZeros((uint64_t)Mask);
    if (IsSigned && BitNo == SrcSize - 1)
      return false;

    ExpectedValue <<= BitNo;

    bool IsReversedCC = false;
    if (CmpValue != ExpectedValue) {
      if (!IsReversible)
        return false;
      IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
      if (!IsReversedCC)
        return false;
    }

    Register DefReg = Def->getOperand(0).getReg();
    if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
      return false;

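    // The compare is going away, so its users will read the SCC produced by
    // the AND (or by the S_BITCMP that may replace it). Bail out if any
    // instruction between the AND and the compare clobbers SCC or already
    // carries a kill of it.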
    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
         I != E; ++I) {
      if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
          I->killsRegister(AMDGPU::SCC, &RI))
        return false;
    }

    MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC);
    SccDef->setIsDead(false);
    CmpInstr.eraseFromParent();

    if (!MRI->use_nodbg_empty(DefReg)) {
      assert(!IsReversedCC);
      return true;
    }

    // Replace the AND, whose result is now unused, with an S_BITCMP.
    MachineBasicBlock *MBB = Def->getParent();

    unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
                                                     : AMDGPU::S_BITCMP1_B32
                                      : IsReversedCC ? AMDGPU::S_BITCMP0_B64
                                                     : AMDGPU::S_BITCMP1_B64;

    BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
        .add(*SrcOp)
        .addImm(BitNo);
    Def->eraseFromParent();

    return true;
  };

  // Arguments to optimizeCmpAnd are (ExpectedValue, SrcSize, IsReversible,
  // IsSigned); see the lambda above.
  switch (CmpInstr.getOpcode()) {
  default:
    break;
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
    return optimizeCmpAnd(1, 32, true, false);
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMPK_GE_U32:
    return optimizeCmpAnd(1, 32, false, false);
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMPK_GE_I32:
    return optimizeCmpAnd(1, 32, false, true);
  case AMDGPU::S_CMP_EQ_U64:
    return optimizeCmpAnd(1, 64, true, false);
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
    return optimizeCmpAnd(0, 32, true, false);
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMPK_GT_U32:
    return optimizeCmpAnd(0, 32, false, false);
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMPK_GT_I32:
    return optimizeCmpAnd(0, 32, false, true);
  case AMDGPU::S_CMP_LG_U64:
    return optimizeCmpAnd(0, 64, true, false);
  }

  return false;
}