//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

// Pull in the TableGen-generated constructor/destructor for the
// AMDGPUGenInstrInfo base class.
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
// Instantiate the TableGen-generated searchable tables used by this file.
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}


// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

// Workaround flag: when a copy pairs a 16-bit physreg with a 32-bit physreg,
// widen the 16-bit side to its 32-bit super-register (see copyPhysReg below).
static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// Return the operand count of \p Node, not counting any trailing glue
/// operands.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the
/// same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither opcode has the operand: vacuously equal.
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  // Exactly one opcode has the operand: not comparable.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

/// Whitelist of VALU moves that are safe to rematerialize even though the
/// generic TargetInstrInfo check rejects them for their implicit exec read.
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                   AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // No implicit operands.
    return MI.getNumOperands() == MI.getDesc().getNumOperands();
  default:
    return false;
  }
}

/// Determine whether two machine-node loads share a base pointer, and if so
/// report their constant offsets. Handles DS, SMRD, and MUBUF/MTBUF loads;
/// all other pairings conservatively return false.
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of datalesss loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // SMRD offsets must be immediates here; bail on anything else.
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

/// True for the DS read2/write2 "st64" variants, whose two offsets are in
/// units of 64 elements rather than 1.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

/// TargetInstrInfo hook: decompose a memory access into its base operand(s),
/// byte offset, and access width, per addressing class (DS, MUBUF/MTBUF,
/// MIMG, SMRD, FLAT). Returns false for instructions it cannot decompose.
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  // AMDGPU offsets are never scalable-vector sized.
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      Width = getOpSize(LdSt, DataOpIdx);
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
        getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm();
      unsigned Offset1 = Offset1Op->getImm();
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        // Loads write a pair of elements into vdst, so one element is half
        // the dest register's size: bits / 8 (bytes) / 2 (elements) == / 16.
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        // Store case: width is the sum of the two data operands.
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = getOpSize(LdSt, DataOpIdx);
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width += getOpSize(LdSt, DataOpIdx);
      } else {
        Width = getOpSize(LdSt, DataOpIdx);
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset && SOffset->isReg()) {
      // We can only handle this if it's a stack access, as any other resource
      // would require reporting multiple base registers.
      const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (AddrReg && !AddrReg->isFI())
        return false;

      const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      const SIMachineFunctionInfo *MFI
        = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
      if (RSrc->getReg() != MFI->getScratchRSrcReg())
        return false;

      const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
      BaseOps.push_back(RSrc);
      BaseOps.push_back(SOffset);
      Offset = OffsetImm->getImm();
    } else {
      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
        return false;
      BaseOps.push_back(BaseOp);

      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (BaseOp)
        BaseOps.push_back(BaseOp);

      const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
      Offset = OffsetImm->getImm();
      if (SOffset) // soffset can be an inline immediate.
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isMIMG(LdSt)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    Offset = 0;
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  return false;
}

/// Compare the first base operands of two memory instructions, falling back
/// to comparing the IR-level underlying objects of their memory operands.
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  const MachineFunction &MF = *MI1.getParent()->getParent();
  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
  Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);

  // Undef bases compare unequal: we cannot prove they alias.
  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

/// Scheduler hook: decide whether two memory ops with the same base should be
/// clustered, limiting cluster size by instruction count and estimated bytes
/// loaded to control register pressure.
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  assert(!BaseOps1.empty() && !BaseOps2.empty());
  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();

  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
    return false;

  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
      (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) ||
      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
    const unsigned MaxGlobalLoadCluster = 7;
    if (NumLoads > MaxGlobalLoadCluster)
      return false;

    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    if (!FirstDst)
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
    if (!SecondDst)
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();

  const Register Reg = FirstDst->getReg();

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);

  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
  // of clusterNeighboringMemOps which was previosly passing cluster length
  // less 1. LoadClusterThreshold should be tuned instead.
  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
         LoadClusterThreshold;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 16 store batches.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have less than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

/// Emit a diagnostic for a copy that cannot be lowered (e.g. SGPR <- VGPR)
/// and insert SI_ILLEGAL_COPY so codegen can proceed past the error.
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal SGPR to VGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Lower a physical-register COPY into real target moves, dispatching on the
/// destination register class: VGPR32, SGPR32/64, SCC, AGPR32, 16-bit
/// subregisters, and finally multi-dword classes split into per-subreg moves.
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

  // FIXME: This is hack to resolve copies between 16 bit and 32 bit
  // registers until all patterns are fixed.
  if (Fix16BitCopies &&
      ((RI.getRegSizeInBits(*RC) == 16) ^
       (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
    // Widen whichever side is 16-bit to its 32-bit super-register.
    MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
    MCRegister Super = RI.get32BitRegister(RegToFix);
    assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
    RegToFix = Super;

    if (DestReg == SrcReg) {
      // Insert empty bundle since ExpandPostRA expects an instruction here.
      BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
      return;
    }

    RC = RI.getPhysRegClass(DestReg);
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      // Materialize SCC as 1/0 via conditional select.
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copy into SCC: compare the source against zero.
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
      .addReg(SrcReg, getKillRegState(KillSrc))
      .addImm(0);
    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
      // There is no direct SGPR/AGPR->AGPR copy; an accvgpr_write needs a
      // VGPR (or immediate) source.
      // First try to find defining accvgpr_write to avoid temporary registers.
      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
        --Def;
        if (!Def->definesRegister(SrcReg, &RI))
          continue;
        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
          break;

        MachineOperand &DefOp = Def->getOperand(1);
        assert(DefOp.isReg() || DefOp.isImm());

        if (DefOp.isReg()) {
          // Check that register source operand if not clobbered before MI.
          // Immediate operands are always safe to propagate.
          bool SafeToPropagate = true;
          for (auto I = Def; I != MI && SafeToPropagate; ++I)
            if (I->modifiesRegister(DefOp.getReg(), &RI))
              SafeToPropagate = false;

          if (!SafeToPropagate)
            break;

          DefOp.setIsKill(false);
        }

        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
          .add(DefOp);
        return;
      }

      // No propagatable def found: go through a scavenged VGPR temporary.
      RegScavenger RS;
      RS.enterBasicBlock(MBB);
      RS.forward(MI);

      // Ideally we want to have three registers for a long reg_sequence copy
      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                                 *MBB.getParent());

      // Registers in the sequence are allocated contiguously so we can just
      // use register number to pick one of three round-robin temps.
      unsigned RegNo = DestReg % 3;
      Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
      if (!Tmp)
        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
      RS.setRegUsed(Tmp);
      // Only loop through if there are any free registers left, otherwise
      // scavenger may report a fatal error without emergency spill slot
      // or spill with the slot.
      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
          break;
        Tmp = Tmp2;
        RS.setRegUsed(Tmp);
      }
      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
        .addReg(Tmp, RegState::Kill);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RI.getRegSizeInBits(*RC) == 16) {
    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
           AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    // Operate on the whole 32-bit super-registers.
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    // General case: SDWA move selecting the proper 16-bit half of src/dst,
    // preserving the untouched half of the destination.
    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
                 .addImm(0) // src0_modifiers
                 .addReg(NewSrcReg)
                 .addImm(0) // clamp
                 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                : AMDGPU::SDWA::SdwaSel::WORD_1)
                 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                : AMDGPU::SDWA::SdwaSel::WORD_1)
                 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  // Remaining (multi-dword) classes: split into per-subregister moves.
  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RC)) {
    // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
    if (!(RI.getRegSizeInBits(*RC) % 64)) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }

    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
  } else if (RI.hasAGPRs(RC)) {
    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
      AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32;
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
  // Copy in increasing register order when dest <= src so overlapping
  // register tuples are not clobbered mid-copy; otherwise go backwards.
  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    if (Opcode == TargetOpcode::COPY) {
      // AGPR<->AGPR without direct opcode: recurse per 32-bit subreg.
      copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
      continue;
    }

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

    // Only the last piece may kill the full source register.
    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
  }
}

/// Map an opcode to its commuted form (or back), returning -1 if the
/// commuted opcode does not exist on this target, or \p Opcode unchanged if
/// no commute mapping exists at all.
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

/// Emit the move(s) that materialize the immediate \p Value into \p DestReg,
/// choosing scalar or vector moves by register class and splitting wide
/// registers into 32/64-bit pieces (only the lowest piece gets \p Value;
/// upper pieces are zeroed).
void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  if (RegClass == &AMDGPU::VReg_64RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    // Low piece gets the immediate; higher pieces are zero.
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
    Builder.addImm(IdxValue);
  }
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  // Selects are always lowered to VALU cndmask, so prefer a 32-bit VGPR.
  return &AMDGPU::VGPR_32RegClass;
}

// NOTE(review): this function is truncated in the visible source; the body
// below reproduces it verbatim up to the truncation point.
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(TrueReg)
        .addImm(0)
        .addReg(FalseReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ?
AMDGPU::S_OR_SAVEEXEC_B32 1047 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 1048 .addImm(0); 1049 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1050 : AMDGPU::S_CSELECT_B64), SReg) 1051 .addImm(0) 1052 .addImm(1); 1053 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1054 .addImm(0) 1055 .addReg(FalseReg) 1056 .addImm(0) 1057 .addReg(TrueReg) 1058 .addReg(SReg); 1059 llvm_unreachable("Unhandled branch predicate EXECZ"); 1060 break; 1061 } 1062 default: 1063 llvm_unreachable("invalid branch predicate"); 1064 } 1065 } else { 1066 llvm_unreachable("Can only handle Cond size 1 or 2"); 1067 } 1068 } 1069 1070 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 1071 MachineBasicBlock::iterator I, 1072 const DebugLoc &DL, 1073 Register SrcReg, int Value) const { 1074 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1075 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 1076 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 1077 .addImm(Value) 1078 .addReg(SrcReg); 1079 1080 return Reg; 1081 } 1082 1083 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, 1084 MachineBasicBlock::iterator I, 1085 const DebugLoc &DL, 1086 Register SrcReg, int Value) const { 1087 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1088 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 1089 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 1090 .addImm(Value) 1091 .addReg(SrcReg); 1092 1093 return Reg; 1094 } 1095 1096 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 1097 1098 if (RI.hasAGPRs(DstRC)) 1099 return AMDGPU::COPY; 1100 if (RI.getRegSizeInBits(*DstRC) == 32) { 1101 return RI.isSGPRClass(DstRC) ? 
           AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

/// Pick the VGPR indirect-write pseudo whose vector width (in bits) covers
/// \p VecSize.
static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

/// Pick the SGPR indirect-write pseudo with 32-bit elements that covers
/// \p VecSize bits.
static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

/// Pick the SGPR indirect-write pseudo with 64-bit elements that covers
/// \p VecSize bits.
static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

/// Return the indirect register-write pseudo for a vector of \p VecSize bits
/// with \p EltSize-bit elements; \p IsSGPR selects the scalar variants
/// (VGPR writes only support 32-bit elements).
const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
  unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
  if (IsSGPR) {
    switch (EltSize) {
    case 32:
      return get(getIndirectSGPRWritePseudo32(VecSize));
    case 64:
      return get(getIndirectSGPRWritePseudo64(VecSize));
    default:
      llvm_unreachable("invalid reg indexing elt size");
    }
  }

  assert(EltSize == 32 && "invalid reg indexing elt size");
  return get(getIndirectVGPRWritePseudoOpc(VecSize));
}

/// Map a spill size in bytes to the SGPR spill-save pseudo opcode.
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_S96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_S160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_S192_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the VGPR spill-save pseudo opcode.
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_V160_SAVE;
  case 24:
    return AMDGPU::SI_SPILL_V192_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the AGPR spill-save pseudo opcode. Note that
/// only a subset of sizes is supported for AGPRs.
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_A64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_A128_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered sgprs, so we
    // need to make sure we are using the correct register class.
    if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    // Add the scratch resource registers as implicit uses because we may end up
    // needing them, and need to ensure that the reserved registers are
    // correctly handled.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
  if (RI.hasAGPRs(RC)) {
    // AGPR spills go through a temporary VGPR.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
}

/// Map a spill size in bytes to the SGPR spill-restore pseudo opcode.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_S192_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the VGPR spill-restore pseudo opcode.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 24:
    return AMDGPU::SI_SPILL_V192_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

/// Map a spill size in bytes to the AGPR spill-restore pseudo opcode. Note
/// that only a subset of sizes is supported for AGPRs.
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
           DestReg != AMDGPU::EXEC && "exec should not be spilled");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    // Single-SGPR restores only work on numbered sgprs; constrain the class.
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                    : getVGPRSpillRestoreOpcode(SpillSize);
  auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
  if (RI.hasAGPRs(RC)) {
    // AGPR reloads go through a temporary VGPR.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addFrameIndex(FrameIndex)           // vaddr
     .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
}

/// Compute (into \p TmpReg) the per-lane LDS address used for spilling the
/// given frame slot, lazily materializing the shifted thread-id value in the
/// entry block the first time it is needed.
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled.
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  Register TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    const DebugLoc &DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    // No free VGPR available; report failure by returning NoRegister.
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      Register TIDIGXReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      Register TIDIGYReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      Register TIDIGZReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      Register InputPtrReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      getAddNoCarry(Entry, Insert, DL, TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg)
        .addImm(0); // clamp bit
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    // Scale the thread id by 4 (dword addressing).
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  getAddNoCarry(MBB, MI, DL, TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg)
    .addImm(0); // clamp bit

  return TmpReg;
}

/// Insert enough S_NOP instructions to cover \p Count wait states (each
/// S_NOP immediate N waits N+1 states, up to 8 per instruction).
void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

/// Append a return terminator (S_ENDPGM or SI_RETURN_TO_EPILOG) to a
/// successor-less block of an entry function that has no terminator yet.
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

/// Number of wait states an instruction accounts for (S_NOP N is N+1).
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

// Lower target-specific pseudo instructions after register allocation.
// Returns true when the pseudo was handled here.
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;

  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Split a 64-bit move into two 32-bit moves of the sub-registers.
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    expandMovDPP64(MI);
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Write the value with EXEC inverted so only the inactive lanes change.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // Same as the B32 variant, using the 64-bit mov pseudo which is then
    // expanded recursively.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16:
  case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: {
    // Lower the indirect-write pseudos to the matching MOVREL/indirect mov.
    const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);

    unsigned Opc;
    if (RI.hasVGPRs(EltRC)) {
      Opc = ST.useVGPRIndexMode() ?
        AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
    } else {
      Opc = RI.getRegSizeInBits(*EltRC) == 64 ?
        AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32;
    }

    const MCInstrDesc &OpDesc = get(Opc);
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstrBuilder MIB =
      BuildMI(MBB, MI, DL, OpDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // Tie the implicit vector def to its implicit use so the register
    // allocator keeps them in the same register.
    const int ImpDefIdx =
      OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is entered.
    MI.setDesc(get(ST.isWave32() ?
                   AMDGPU::S_OR_SAVEEXEC_B32
                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  }
  return true;
}

/// Split a V_MOV_B64_DPP_PSEUDO into two 32-bit V_MOV_B32_dpp instructions
/// (one per sub-register half). Returns the pair of emitted instructions.
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register Dst = MI.getOperand(0).getReg();
  unsigned Part = 0;
  MachineInstr *Split[2];


  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
    if (Dst.isPhysical()) {
      MovDPP.addDef(RI.getSubReg(Dst, Sub));
    } else {
      // Virtual dests get fresh 32-bit temporaries, recombined below.
      assert(MRI.isSSA());
      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      MovDPP.addDef(Tmp);
    }

    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
      const MachineOperand &SrcOp = MI.getOperand(I);
      assert(!SrcOp.isFPImm());
      if (SrcOp.isImm()) {
        // Select the 32-bit half of the immediate for this part.
        APInt Imm(64, SrcOp.getImm());
        Imm.ashrInPlace(Part * 32);
        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
      } else {
        assert(SrcOp.isReg());
        Register Src = SrcOp.getReg();
        if (Src.isPhysical())
          MovDPP.addReg(RI.getSubReg(Src, Sub));
        else
          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
      }
    }

    // Copy the remaining (DPP control) immediate operands unchanged.
    for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
      MovDPP.addImm(MI.getOperand(I).getImm());

    Split[Part] = MovDPP;
    ++Part;
  }

  if (Dst.isVirtual())
    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(Split[0]->getOperand(0).getReg())
      .addImm(AMDGPU::sub0)
      .addReg(Split[1]->getOperand(0).getReg())
      .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return std::make_pair(Split[0], Split[1]);
}

/// Swap the source-modifier immediates of two operands (used when commuting).
/// Returns false when the instruction has no src0 modifiers.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

/// Exchange a register operand with an immediate/frame-index operand in
/// place. Returns &MI on success, nullptr if the non-register operand kind is
/// unsupported.
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  Register Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  // Re-create the register operand with its original flags preserved.
  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}

// Determine the operand indices (named src0/src1) that commuting would
// exchange, and report whether the instruction is commutable at all.
// NOTE(review): Desc is taken by value; a const MCInstrDesc& would avoid the
// copy, but the change must be mirrored in the declaration in SIInstrInfo.h.
bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  // Commuting needs a second explicit source operand.
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

// Check whether a conditional/unconditional SOPP branch can reach a byte
// offset of BrOffset, given the dword-scaled SIMM16 displacement field. The
// usable bit width may be artificially narrowed by -amdgpu-s-branch-bits
// (BranchOffsetBits) for testing branch relaxation.
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

// Expand an unconditional branch whose displacement does not fit the SIMM16
// branch field: materialize the destination address in an SGPR pair with
// s_getpc_b64 plus a 64-bit add/sub of the block address, then jump to it
// with s_setpc_b64. Returns the encoded byte size of the emitted sequence.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    // Forward branch: PC + (DestBB - here), carried across the 64-bit pair.
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  // Replace the virtual-register placeholder with a scavenged physical SGPR
  // pair (see FIXME above about spilling).
  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegisterBackwards(
    AMDGPU::SReg_64RegClass,
    MachineBasicBlock::iterator(GetPC), false, 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Presumed breakdown: s_getpc (4) + s_add/s_sub with 32-bit literal (8) +
  // s_addc/s_subb (4) + s_setpc (4) — TODO confirm encoding sizes.
  return 4 + 8 + 4 + 4;
}

// Map a BranchPredicate to the s_cbranch opcode that implements it.
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

// Inverse of getBranchOpcode: map a conditional-branch opcode back to its
// BranchPredicate, or INVALID_BR for any other opcode.
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

// Analyze the branch sequence starting at terminator I. On success (returns
// false) the outputs follow the TargetInstrInfo::analyzeBranch contract:
//  - unconditional S_BRANCH: TBB set, Cond left empty;
//  - SI_NON_UNIFORM_BRCOND_PSEUDO: Cond holds the single condition register;
//  - s_cbranch_*: Cond holds {predicate immediate, condition register}.
// Returns true when the terminators cannot be analyzed.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Conditional branch followed by an unconditional branch.
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  auto E = MBB.end();
  if (I == E)
    return false;

  // Skip over the instructions that are artificially terminators for special
  // exec management.
  while (I != E && !I->isBranch() && !I->isReturn() &&
         I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
    switch (I->getOpcode()) {
    // NOTE(review): this case is unreachable — the loop condition above
    // already excludes SI_MASK_BRANCH.
    case AMDGPU::SI_MASK_BRANCH:
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  if (I == E)
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  // NOTE(review): I was advanced past SI_MASK_BRANCH above and is never
  // rewound, so this reads operand 0 of the *following* branch, not of the
  // mask branch itself — confirm this is the intended destination.
  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

// Erase this block's terminators, reporting how many instructions were
// removed and (via BytesRemoved) their total encoded size.
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    // SI_MASK_BRANCH is deliberately preserved: neither erased nor counted.
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
2263 static void preserveCondRegFlags(MachineOperand &CondReg, 2264 const MachineOperand &OrigCond) { 2265 CondReg.setIsUndef(OrigCond.isUndef()); 2266 CondReg.setIsKill(OrigCond.isKill()); 2267 } 2268 2269 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 2270 MachineBasicBlock *TBB, 2271 MachineBasicBlock *FBB, 2272 ArrayRef<MachineOperand> Cond, 2273 const DebugLoc &DL, 2274 int *BytesAdded) const { 2275 if (!FBB && Cond.empty()) { 2276 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2277 .addMBB(TBB); 2278 if (BytesAdded) 2279 *BytesAdded = 4; 2280 return 1; 2281 } 2282 2283 if(Cond.size() == 1 && Cond[0].isReg()) { 2284 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 2285 .add(Cond[0]) 2286 .addMBB(TBB); 2287 return 1; 2288 } 2289 2290 assert(TBB && Cond[0].isImm()); 2291 2292 unsigned Opcode 2293 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 2294 2295 if (!FBB) { 2296 Cond[1].isUndef(); 2297 MachineInstr *CondBr = 2298 BuildMI(&MBB, DL, get(Opcode)) 2299 .addMBB(TBB); 2300 2301 // Copy the flags onto the implicit condition register operand. 
2302 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 2303 2304 if (BytesAdded) 2305 *BytesAdded = 4; 2306 return 1; 2307 } 2308 2309 assert(TBB && FBB); 2310 2311 MachineInstr *CondBr = 2312 BuildMI(&MBB, DL, get(Opcode)) 2313 .addMBB(TBB); 2314 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2315 .addMBB(FBB); 2316 2317 MachineOperand &CondReg = CondBr->getOperand(1); 2318 CondReg.setIsUndef(Cond[1].isUndef()); 2319 CondReg.setIsKill(Cond[1].isKill()); 2320 2321 if (BytesAdded) 2322 *BytesAdded = 8; 2323 2324 return 2; 2325 } 2326 2327 bool SIInstrInfo::reverseBranchCondition( 2328 SmallVectorImpl<MachineOperand> &Cond) const { 2329 if (Cond.size() != 2) { 2330 return true; 2331 } 2332 2333 if (Cond[0].isImm()) { 2334 Cond[0].setImm(-Cond[0].getImm()); 2335 return false; 2336 } 2337 2338 return true; 2339 } 2340 2341 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2342 ArrayRef<MachineOperand> Cond, 2343 Register DstReg, Register TrueReg, 2344 Register FalseReg, int &CondCycles, 2345 int &TrueCycles, int &FalseCycles) const { 2346 switch (Cond[0].getImm()) { 2347 case VCCNZ: 2348 case VCCZ: { 2349 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2350 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2351 assert(MRI.getRegClass(FalseReg) == RC); 2352 2353 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2354 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2355 2356 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2357 return RI.hasVGPRs(RC) && NumInsts <= 6; 2358 } 2359 case SCC_TRUE: 2360 case SCC_FALSE: { 2361 // FIXME: We could insert for VGPRs if we could replace the original compare 2362 // with a vector one. 
2363 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2364 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2365 assert(MRI.getRegClass(FalseReg) == RC); 2366 2367 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2368 2369 // Multiples of 8 can do s_cselect_b64 2370 if (NumInsts % 2 == 0) 2371 NumInsts /= 2; 2372 2373 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2374 return RI.isSGPRClass(RC); 2375 } 2376 default: 2377 return false; 2378 } 2379 } 2380 2381 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2382 MachineBasicBlock::iterator I, const DebugLoc &DL, 2383 Register DstReg, ArrayRef<MachineOperand> Cond, 2384 Register TrueReg, Register FalseReg) const { 2385 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2386 if (Pred == VCCZ || Pred == SCC_FALSE) { 2387 Pred = static_cast<BranchPredicate>(-Pred); 2388 std::swap(TrueReg, FalseReg); 2389 } 2390 2391 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2392 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2393 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2394 2395 if (DstSize == 32) { 2396 MachineInstr *Select; 2397 if (Pred == SCC_TRUE) { 2398 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 2399 .addReg(TrueReg) 2400 .addReg(FalseReg); 2401 } else { 2402 // Instruction's operands are backwards from what is expected. 
2403 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 2404 .addReg(FalseReg) 2405 .addReg(TrueReg); 2406 } 2407 2408 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2409 return; 2410 } 2411 2412 if (DstSize == 64 && Pred == SCC_TRUE) { 2413 MachineInstr *Select = 2414 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2415 .addReg(TrueReg) 2416 .addReg(FalseReg); 2417 2418 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2419 return; 2420 } 2421 2422 static const int16_t Sub0_15[] = { 2423 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2424 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2425 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2426 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2427 }; 2428 2429 static const int16_t Sub0_15_64[] = { 2430 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2431 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2432 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2433 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2434 }; 2435 2436 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2437 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2438 const int16_t *SubIndices = Sub0_15; 2439 int NElts = DstSize / 32; 2440 2441 // 64-bit select is only available for SALU. 2442 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 
2443 if (Pred == SCC_TRUE) { 2444 if (NElts % 2) { 2445 SelOp = AMDGPU::S_CSELECT_B32; 2446 EltRC = &AMDGPU::SGPR_32RegClass; 2447 } else { 2448 SelOp = AMDGPU::S_CSELECT_B64; 2449 EltRC = &AMDGPU::SGPR_64RegClass; 2450 SubIndices = Sub0_15_64; 2451 NElts /= 2; 2452 } 2453 } 2454 2455 MachineInstrBuilder MIB = BuildMI( 2456 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 2457 2458 I = MIB->getIterator(); 2459 2460 SmallVector<Register, 8> Regs; 2461 for (int Idx = 0; Idx != NElts; ++Idx) { 2462 Register DstElt = MRI.createVirtualRegister(EltRC); 2463 Regs.push_back(DstElt); 2464 2465 unsigned SubIdx = SubIndices[Idx]; 2466 2467 MachineInstr *Select; 2468 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { 2469 Select = 2470 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2471 .addReg(FalseReg, 0, SubIdx) 2472 .addReg(TrueReg, 0, SubIdx); 2473 } else { 2474 Select = 2475 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2476 .addReg(TrueReg, 0, SubIdx) 2477 .addReg(FalseReg, 0, SubIdx); 2478 } 2479 2480 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2481 fixImplicitOperands(*Select); 2482 2483 MIB.addReg(DstElt) 2484 .addImm(SubIdx); 2485 } 2486 } 2487 2488 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 2489 switch (MI.getOpcode()) { 2490 case AMDGPU::V_MOV_B32_e32: 2491 case AMDGPU::V_MOV_B32_e64: 2492 case AMDGPU::V_MOV_B64_PSEUDO: { 2493 // If there are additional implicit register operands, this may be used for 2494 // register indexing so the source register operand isn't simply copied. 
2495 unsigned NumOps = MI.getDesc().getNumOperands() + 2496 MI.getDesc().getNumImplicitUses(); 2497 2498 return MI.getNumOperands() == NumOps; 2499 } 2500 case AMDGPU::S_MOV_B32: 2501 case AMDGPU::S_MOV_B64: 2502 case AMDGPU::COPY: 2503 case AMDGPU::V_ACCVGPR_WRITE_B32: 2504 case AMDGPU::V_ACCVGPR_READ_B32: 2505 return true; 2506 default: 2507 return false; 2508 } 2509 } 2510 2511 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( 2512 unsigned Kind) const { 2513 switch(Kind) { 2514 case PseudoSourceValue::Stack: 2515 case PseudoSourceValue::FixedStack: 2516 return AMDGPUAS::PRIVATE_ADDRESS; 2517 case PseudoSourceValue::ConstantPool: 2518 case PseudoSourceValue::GOT: 2519 case PseudoSourceValue::JumpTable: 2520 case PseudoSourceValue::GlobalValueCallEntry: 2521 case PseudoSourceValue::ExternalSymbolCallEntry: 2522 case PseudoSourceValue::TargetCustom: 2523 return AMDGPUAS::CONSTANT_ADDRESS; 2524 } 2525 return AMDGPUAS::FLAT_ADDRESS; 2526 } 2527 2528 static void removeModOperands(MachineInstr &MI) { 2529 unsigned Opc = MI.getOpcode(); 2530 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2531 AMDGPU::OpName::src0_modifiers); 2532 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2533 AMDGPU::OpName::src1_modifiers); 2534 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2535 AMDGPU::OpName::src2_modifiers); 2536 2537 MI.RemoveOperand(Src2ModIdx); 2538 MI.RemoveOperand(Src1ModIdx); 2539 MI.RemoveOperand(Src0ModIdx); 2540 } 2541 2542 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 2543 Register Reg, MachineRegisterInfo *MRI) const { 2544 if (!MRI->hasOneNonDBGUse(Reg)) 2545 return false; 2546 2547 switch (DefMI.getOpcode()) { 2548 default: 2549 return false; 2550 case AMDGPU::S_MOV_B64: 2551 // TODO: We could fold 64-bit immediates, but this get compilicated 2552 // when there are sub-registers. 
2553 return false; 2554 2555 case AMDGPU::V_MOV_B32_e32: 2556 case AMDGPU::S_MOV_B32: 2557 case AMDGPU::V_ACCVGPR_WRITE_B32: 2558 break; 2559 } 2560 2561 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 2562 assert(ImmOp); 2563 // FIXME: We could handle FrameIndex values here. 2564 if (!ImmOp->isImm()) 2565 return false; 2566 2567 unsigned Opc = UseMI.getOpcode(); 2568 if (Opc == AMDGPU::COPY) { 2569 Register DstReg = UseMI.getOperand(0).getReg(); 2570 bool Is16Bit = getOpSize(UseMI, 0) == 2; 2571 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); 2572 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2573 APInt Imm(32, ImmOp->getImm()); 2574 2575 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) 2576 Imm = Imm.ashr(16); 2577 2578 if (RI.isAGPR(*MRI, DstReg)) { 2579 if (!isInlineConstant(Imm)) 2580 return false; 2581 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; 2582 } 2583 2584 if (Is16Bit) { 2585 if (isVGPRCopy) 2586 return false; // Do not clobber vgpr_hi16 2587 2588 if (DstReg.isVirtual() && 2589 UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) 2590 return false; 2591 2592 UseMI.getOperand(0).setSubReg(0); 2593 if (DstReg.isPhysical()) { 2594 DstReg = RI.get32BitRegister(DstReg); 2595 UseMI.getOperand(0).setReg(DstReg); 2596 } 2597 assert(UseMI.getOperand(1).getReg().isVirtual()); 2598 } 2599 2600 UseMI.setDesc(get(NewOpc)); 2601 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); 2602 UseMI.getOperand(1).setTargetFlags(0); 2603 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2604 return true; 2605 } 2606 2607 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2608 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || 2609 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2610 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { 2611 // Don't fold if we are using source or output modifiers. The new VOP2 2612 // instructions don't have them. 
2613 if (hasAnyModifiersSet(UseMI)) 2614 return false; 2615 2616 // If this is a free constant, there's no reason to do this. 2617 // TODO: We could fold this here instead of letting SIFoldOperands do it 2618 // later. 2619 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2620 2621 // Any src operand can be used for the legality check. 2622 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2623 return false; 2624 2625 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2626 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; 2627 bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2628 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; 2629 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2630 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2631 2632 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 2633 // We should only expect these to be on src0 due to canonicalizations. 2634 if (Src0->isReg() && Src0->getReg() == Reg) { 2635 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2636 return false; 2637 2638 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2639 return false; 2640 2641 unsigned NewOpc = 2642 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) 2643 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 2644 if (pseudoToMCOpcode(NewOpc) == -1) 2645 return false; 2646 2647 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 2648 2649 const int64_t Imm = ImmOp->getImm(); 2650 2651 // FIXME: This would be a lot easier if we could return a new instruction 2652 // instead of having to modify in place. 2653 2654 // Remove these first since they are at the end. 
2655 UseMI.RemoveOperand( 2656 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2657 UseMI.RemoveOperand( 2658 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2659 2660 Register Src1Reg = Src1->getReg(); 2661 unsigned Src1SubReg = Src1->getSubReg(); 2662 Src0->setReg(Src1Reg); 2663 Src0->setSubReg(Src1SubReg); 2664 Src0->setIsKill(Src1->isKill()); 2665 2666 if (Opc == AMDGPU::V_MAC_F32_e64 || 2667 Opc == AMDGPU::V_MAC_F16_e64 || 2668 Opc == AMDGPU::V_FMAC_F32_e64 || 2669 Opc == AMDGPU::V_FMAC_F16_e64) 2670 UseMI.untieRegOperand( 2671 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2672 2673 Src1->ChangeToImmediate(Imm); 2674 2675 removeModOperands(UseMI); 2676 UseMI.setDesc(get(NewOpc)); 2677 2678 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2679 if (DeleteDef) 2680 DefMI.eraseFromParent(); 2681 2682 return true; 2683 } 2684 2685 // Added part is the constant: Use v_madak_{f16, f32}. 2686 if (Src2->isReg() && Src2->getReg() == Reg) { 2687 // Not allowed to use constant bus for another operand. 2688 // We can however allow an inline immediate as src0. 2689 bool Src0Inlined = false; 2690 if (Src0->isReg()) { 2691 // Try to inline constant if possible. 2692 // If the Def moves immediate and the use is single 2693 // We are saving VGPR here. 
2694 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2695 if (Def && Def->isMoveImmediate() && 2696 isInlineConstant(Def->getOperand(1)) && 2697 MRI->hasOneUse(Src0->getReg())) { 2698 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2699 Src0Inlined = true; 2700 } else if ((Register::isPhysicalRegister(Src0->getReg()) && 2701 (ST.getConstantBusLimit(Opc) <= 1 && 2702 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || 2703 (Register::isVirtualRegister(Src0->getReg()) && 2704 (ST.getConstantBusLimit(Opc) <= 1 && 2705 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 2706 return false; 2707 // VGPR is okay as Src0 - fallthrough 2708 } 2709 2710 if (Src1->isReg() && !Src0Inlined ) { 2711 // We have one slot for inlinable constant so far - try to fill it 2712 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2713 if (Def && Def->isMoveImmediate() && 2714 isInlineConstant(Def->getOperand(1)) && 2715 MRI->hasOneUse(Src1->getReg()) && 2716 commuteInstruction(UseMI)) { 2717 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2718 } else if ((Register::isPhysicalRegister(Src1->getReg()) && 2719 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2720 (Register::isVirtualRegister(Src1->getReg()) && 2721 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2722 return false; 2723 // VGPR is okay as Src1 - fallthrough 2724 } 2725 2726 unsigned NewOpc = 2727 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) 2728 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 2729 if (pseudoToMCOpcode(NewOpc) == -1) 2730 return false; 2731 2732 const int64_t Imm = ImmOp->getImm(); 2733 2734 // FIXME: This would be a lot easier if we could return a new instruction 2735 // instead of having to modify in place. 2736 2737 // Remove these first since they are at the end. 
      // Drop omod/clamp first since they are at the end of the operand list.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      // MAC/FMAC tie src2 to the destination; untie before rewriting it.
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));
      // It might happen that UseMI was commuted
      // and we now have SGPR as SRC1. If so 2 inlined
      // constant and SGPR are illegal.
      legalizeOperands(UseMI);

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}

// True when the two base-operand lists are element-wise identical (same
// length, operands identical per MachineOperand::isIdenticalTo).
static bool
memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
                           ArrayRef<const MachineOperand *> BaseOps2) {
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  }
  return true;
}

// True when [OffsetA, OffsetA+WidthA) and [OffsetB, OffsetB+WidthB) do not
// intersect: the lower interval must end at or before the higher one starts.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

// Prove two memory instructions access disjoint ranges off the same base
// operands. Conservatively returns false whenever disjointness cannot be
// established (different/unknown bases, multiple memoperands, ...).
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
                                               const MachineInstr &MIb) const {
  SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
  int64_t Offset0, Offset1;
  unsigned Dummy0, Dummy1;
  bool Offset0IsScalable, Offset1IsScalable;
  if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
                                     Dummy0, &RI) ||
      !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
                                     Dummy1, &RI))
    return false;

  if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
    return false;

  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
    // FIXME: Handle ds_read2 / ds_write2.
    return false;
  }
  unsigned Width0 = MIa.memoperands().front()->getSize();
  unsigned Width1 = MIb.memoperands().front()->getSize();
  return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
}

bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
                                                  const MachineInstr &MIb) const {
  assert(MIa.mayLoadOrStore() &&
         "MIa must load from or modify a memory location");
  assert(MIb.mayLoadOrStore() &&
         "MIb must load from or modify a memory location");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
2835 if (isDS(MIa)) { 2836 if (isDS(MIb)) 2837 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2838 2839 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2840 } 2841 2842 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2843 if (isMUBUF(MIb) || isMTBUF(MIb)) 2844 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2845 2846 return !isFLAT(MIb) && !isSMRD(MIb); 2847 } 2848 2849 if (isSMRD(MIa)) { 2850 if (isSMRD(MIb)) 2851 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2852 2853 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 2854 } 2855 2856 if (isFLAT(MIa)) { 2857 if (isFLAT(MIb)) 2858 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2859 2860 return false; 2861 } 2862 2863 return false; 2864 } 2865 2866 static int64_t getFoldableImm(const MachineOperand* MO) { 2867 if (!MO->isReg()) 2868 return false; 2869 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2870 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2871 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2872 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2873 Def->getOperand(1).isImm()) 2874 return Def->getOperand(1).getImm(); 2875 return AMDGPU::NoRegister; 2876 } 2877 2878 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2879 MachineInstr &MI, 2880 LiveVariables *LV) const { 2881 unsigned Opc = MI.getOpcode(); 2882 bool IsF16 = false; 2883 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2884 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; 2885 2886 switch (Opc) { 2887 default: 2888 return nullptr; 2889 case AMDGPU::V_MAC_F16_e64: 2890 case AMDGPU::V_FMAC_F16_e64: 2891 IsF16 = true; 2892 LLVM_FALLTHROUGH; 2893 case AMDGPU::V_MAC_F32_e64: 2894 case AMDGPU::V_FMAC_F32_e64: 2895 break; 2896 case AMDGPU::V_MAC_F16_e32: 2897 case AMDGPU::V_FMAC_F16_e32: 2898 IsF16 = true; 2899 LLVM_FALLTHROUGH; 2900 case AMDGPU::V_MAC_F32_e32: 2901 case AMDGPU::V_FMAC_F32_e32: { 2902 int Src0Idx = 
/// Convert a two-address V_MAC/V_FMAC into a three-address form: the
/// MADAK/MADMK (FMAAK/FMAMK) literal-folding encodings when one source is a
/// foldable immediate and no modifiers are present, otherwise the full VOP3
/// V_MAD/V_FMA. Returns the new instruction, or nullptr if no conversion
/// applies. The new instruction is inserted before \p MI.
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {
  unsigned Opc = MI.getOpcode();
  bool IsF16 = false;
  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;

  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_FMAC_F32_e32: {
    // For the e32 forms, only accept a register or an inline-constant src0;
    // anything else can't be converted.
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      return nullptr;

    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);

  // Try the literal-folding MADAK/MADMK (FMAAK/FMAMK) forms first. They carry
  // no modifier operands, so any modifier forces the VOP3 fallback below.
  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 ||
       !Src0->isReg() ||
       !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
    // src2 foldable: the *AK form (operands dst, src0, src1, literal).
    if (auto Imm = getFoldableImm(Src2)) {
      unsigned NewOpc =
         IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
               : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
      if (pseudoToMCOpcode(NewOpc) != -1)
        return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                 .add(*Dst)
                 .add(*Src0)
                 .add(*Src1)
                 .addImm(Imm);
    }
    unsigned NewOpc =
      IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
            : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
    // src1 foldable: the *MK form (operands dst, src0, literal, src2).
    if (auto Imm = getFoldableImm(Src1)) {
      if (pseudoToMCOpcode(NewOpc) != -1)
        return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                 .add(*Dst)
                 .add(*Src0)
                 .addImm(Imm)
                 .add(*Src2);
    }
    // src0 foldable: commute src0/src1 into the *MK form, provided src1 is
    // legal in the src0 slot of the new opcode.
    if (auto Imm = getFoldableImm(Src0)) {
      if (pseudoToMCOpcode(NewOpc) != -1 &&
          isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
                           AMDGPU::OpName::src0), Src1))
        return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                 .add(*Dst)
                 .add(*Src1)
                 .addImm(Imm)
                 .add(*Src2);
    }
  }

  // Fall back to full VOP3 MAD/FMA, which preserves src modifiers/clamp/omod.
  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
                          : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
      .add(*Dst)
      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
      .add(*Src0)
      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
      .add(*Src1)
      .addImm(0) // Src mods
      .add(*Src2)
      .addImm(Clamp ? Clamp->getImm() : 0)
      .addImm(Omod ? Omod->getImm() : 0);
}

// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}
It triggers unnecessary hazard nops 3003 // but is probably avoidable. 3004 3005 // Copied from base implementation. 3006 // Terminators and labels can't be scheduled around. 3007 if (MI.isTerminator() || MI.isPosition()) 3008 return true; 3009 3010 // Target-independent instructions do not have an implicit-use of EXEC, even 3011 // when they operate on VGPRs. Treating EXEC modifications as scheduling 3012 // boundaries prevents incorrect movements of such instructions. 3013 3014 // TODO: Don't treat setreg with known constant that only changes MODE as 3015 // barrier. 3016 return MI.modifiesRegister(AMDGPU::EXEC, &RI) || 3017 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 3018 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 3019 changesVGPRIndexingMode(MI); 3020 } 3021 3022 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 3023 return Opcode == AMDGPU::DS_ORDERED_COUNT || 3024 Opcode == AMDGPU::DS_GWS_INIT || 3025 Opcode == AMDGPU::DS_GWS_SEMA_V || 3026 Opcode == AMDGPU::DS_GWS_SEMA_BR || 3027 Opcode == AMDGPU::DS_GWS_SEMA_P || 3028 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 3029 Opcode == AMDGPU::DS_GWS_BARRIER; 3030 } 3031 3032 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { 3033 // Skip the full operand and register alias search modifiesRegister 3034 // does. There's only a handful of instructions that touch this, it's only an 3035 // implicit def, and doesn't alias any other registers. 3036 if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { 3037 for (; ImpDef && *ImpDef; ++ImpDef) { 3038 if (*ImpDef == AMDGPU::MODE) 3039 return true; 3040 } 3041 } 3042 3043 return false; 3044 } 3045 3046 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 3047 unsigned Opcode = MI.getOpcode(); 3048 3049 if (MI.mayStore() && isSMRD(MI)) 3050 return true; // scalar store or atomic 3051 3052 // This will terminate the function when other lanes may need to continue. 
/// Whether executing \p MI with an empty EXEC mask can still have observable
/// or dangerous effects (scalar stores, returns, message/export/GWS traffic,
/// calls, inline asm, MODE changes, or lane reads of undefined data).
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();

  if (MI.mayStore() && isSMRD(MI))
    return true; // scalar store or atomic

  // This will terminate the function when other lanes may need to continue.
  if (MI.isReturn())
    return true;

  // These instructions cause shader I/O that may cause hardware lockups
  // when executed with an empty EXEC mask.
  //
  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
  // EXEC = 0, but checking for that case here seems not worth it
  // given the typical code patterns.
  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
      Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
      Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
      Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
    return true;

  if (MI.isCall() || MI.isInlineAsm())
    return true; // conservative assumption

  // A mode change is a scalar operation that influences vector instructions.
  if (modifiesModeRegister(MI))
    return true;

  // These are like SALU instructions in terms of effects, so it's questionable
  // whether we should return true for those.
  //
  // However, executing them with EXEC = 0 causes them to operate on undefined
  // data, which we avoid by returning true here.
  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
    return true;

  return false;
}

/// Whether \p MI may (explicitly or implicitly) read the EXEC mask.
/// Conservatively returns true for calls and unhandled generic opcodes.
bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
                              const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return false;

  // This won't read exec if this is an SGPR->SGPR copy.
  if (MI.isCopyLike()) {
    if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
      return true;

    // Make sure this isn't copying exec as a normal operand
    return MI.readsRegister(AMDGPU::EXEC, &RI);
  }

  // Make a conservative assumption about the callee.
  if (MI.isCall())
    return true;

  // Be conservative with any unhandled generic opcodes.
  if (!isTargetSpecificOpcode(MI.getOpcode()))
    return true;

  // Non-SALU target instructions are assumed to read EXEC unless proven
  // otherwise by an explicit operand scan.
  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
}
/// Whether an immediate of the given bit width can be encoded as an inline
/// constant (no extra literal dword needed).
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  switch (Imm.getBitWidth()) {
  case 1: // This likely will be a condition code mask.
    return true;

  case 32:
    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 64:
    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 16:
    // 16-bit inline constants only exist on subtargets with 16-bit insts.
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

/// Whether \p MO is an immediate that can be encoded inline for an operand of
/// type \p OperandType (one of the AMDGPU::OPERAND_* source operand kinds).
/// Non-immediates and non-source operand types return false.
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   uint8_t OperandType) const {
  if (!MO.isImm() ||
      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
      OperandType > AMDGPU::OPERAND_SRC_LAST)
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  int64_t Imm = MO.getImm();
  switch (OperandType) {
  case AMDGPU::OPERAND_REG_IMM_INT32:
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  case AMDGPU::OPERAND_REG_IMM_INT64:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                        ST.hasInv2PiInlineImm());
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems read the low 16-bits
    // of 32-bit immediates, which happens to always work for the integer
    // values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isInlinableIntLiteral(Imm);
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
    // This suffers the same problem as the scalar 16-bit cases.
    return AMDGPU::isInlinableIntLiteralV216(Imm);
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
    uint32_t Trunc = static_cast<uint32_t>(Imm);
    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
  }
  default:
    llvm_unreachable("invalid bitwidth");
  }
}
We shouldn't really need to handle 3186 // constants in these cases 3187 int16_t Trunc = static_cast<int16_t>(Imm); 3188 return ST.has16BitInsts() && 3189 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 3190 } 3191 3192 return false; 3193 } 3194 case AMDGPU::OPERAND_REG_IMM_V2FP16: 3195 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 3196 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 3197 uint32_t Trunc = static_cast<uint32_t>(Imm); 3198 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 3199 } 3200 default: 3201 llvm_unreachable("invalid bitwidth"); 3202 } 3203 } 3204 3205 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 3206 const MCOperandInfo &OpInfo) const { 3207 switch (MO.getType()) { 3208 case MachineOperand::MO_Register: 3209 return false; 3210 case MachineOperand::MO_Immediate: 3211 return !isInlineConstant(MO, OpInfo); 3212 case MachineOperand::MO_FrameIndex: 3213 case MachineOperand::MO_MachineBasicBlock: 3214 case MachineOperand::MO_ExternalSymbol: 3215 case MachineOperand::MO_GlobalAddress: 3216 case MachineOperand::MO_MCSymbol: 3217 return true; 3218 default: 3219 llvm_unreachable("unexpected operand type"); 3220 } 3221 } 3222 3223 static bool compareMachineOp(const MachineOperand &Op0, 3224 const MachineOperand &Op1) { 3225 if (Op0.getType() != Op1.getType()) 3226 return false; 3227 3228 switch (Op0.getType()) { 3229 case MachineOperand::MO_Register: 3230 return Op0.getReg() == Op1.getReg(); 3231 case MachineOperand::MO_Immediate: 3232 return Op0.getImm() == Op1.getImm(); 3233 default: 3234 llvm_unreachable("Didn't expect to be comparing these operand types"); 3235 } 3236 } 3237 3238 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 3239 const MachineOperand &MO) const { 3240 const MCInstrDesc &InstDesc = MI.getDesc(); 3241 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; 3242 3243 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3244 3245 if 
(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3246 return true; 3247 3248 if (OpInfo.RegClass < 0) 3249 return false; 3250 3251 const MachineFunction *MF = MI.getParent()->getParent(); 3252 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3253 3254 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3255 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3256 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3257 AMDGPU::OpName::src2)) 3258 return false; 3259 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3260 } 3261 3262 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3263 return false; 3264 3265 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3266 return true; 3267 3268 return ST.hasVOP3Literal(); 3269 } 3270 3271 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3272 int Op32 = AMDGPU::getVOPe32(Opcode); 3273 if (Op32 == -1) 3274 return false; 3275 3276 return pseudoToMCOpcode(Op32) != -1; 3277 } 3278 3279 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3280 // The src0_modifier operand is present on all instructions 3281 // that have modifiers. 
/// Whether the named modifier operand exists on \p MI with a non-zero value.
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

/// Whether any source modifier, clamp, or omod operand is set on \p MI.
bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         hasModifiersSet(MI, AMDGPU::OpName::omod);
}

/// Whether \p MI can be shrunk from a 64-bit (VOP3) to a 32-bit encoding:
/// no source/output modifiers, src1/src2 in VGPRs where required, and a
/// valid e32 opcode must exist.
bool SIInstrInfo::canShrink(const MachineInstr &MI,
                            const MachineRegisterInfo &MRI) const {
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it. It can only be shrunk if the third operand
  // is vcc, and src0_modifiers and src1_modifiers are not set.
  // We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_ADDC_U32_e64:
      case AMDGPU::V_SUBB_U32_e64:
      case AMDGPU::V_SUBBREV_U32_e64: {
        // The carry ops implicitly use VCC in e32 form; src1 must be a VGPR.
        const MachineOperand *Src1
          = getNamedOperand(MI, AMDGPU::OpName::src1);
        if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
          return false;
        // Additional verification is needed for sdst/src2.
        return true;
      }
      case AMDGPU::V_MAC_F32_e64:
      case AMDGPU::V_MAC_F16_e64:
      case AMDGPU::V_FMAC_F32_e64:
      case AMDGPU::V_FMAC_F16_e64:
        // MAC/FMAC: src2 must be a modifier-free VGPR (it is tied to dst in
        // the e32 form).
        if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
            hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
               hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Can it be shrunk to a valid 32 bit opcode?
  if (!hasVALU32BitEncoding(MI.getOpcode()))
    return false;

  // Check output modifiers
  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
         !hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
3323 return true; 3324 } 3325 case AMDGPU::V_MAC_F32_e64: 3326 case AMDGPU::V_MAC_F16_e64: 3327 case AMDGPU::V_FMAC_F32_e64: 3328 case AMDGPU::V_FMAC_F16_e64: 3329 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 3330 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 3331 return false; 3332 break; 3333 3334 case AMDGPU::V_CNDMASK_B32_e64: 3335 break; 3336 } 3337 } 3338 3339 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3340 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 3341 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 3342 return false; 3343 3344 // We don't need to check src0, all input types are legal, so just make sure 3345 // src0 isn't using any modifiers. 3346 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 3347 return false; 3348 3349 // Can it be shrunk to a valid 32 bit opcode? 3350 if (!hasVALU32BitEncoding(MI.getOpcode())) 3351 return false; 3352 3353 // Check output modifiers 3354 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 3355 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 3356 } 3357 3358 // Set VCC operand with all flags from \p Orig, except for setting it as 3359 // implicit. 3360 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3361 const MachineOperand &Orig) { 3362 3363 for (MachineOperand &Use : MI.implicit_operands()) { 3364 if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { 3365 Use.setIsUndef(Orig.isUndef()); 3366 Use.setIsKill(Orig.isKill()); 3367 return; 3368 } 3369 } 3370 } 3371 3372 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3373 unsigned Op32) const { 3374 MachineBasicBlock *MBB = MI.getParent();; 3375 MachineInstrBuilder Inst32 = 3376 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 3377 .setMIFlags(MI.getFlags()); 3378 3379 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3380 // For VOPC instructions, this is replaced by an implicit def of vcc. 
3381 int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); 3382 if (Op32DstIdx != -1) { 3383 // dst 3384 Inst32.add(MI.getOperand(0)); 3385 } else { 3386 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 3387 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 3388 "Unexpected case"); 3389 } 3390 3391 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 3392 3393 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3394 if (Src1) 3395 Inst32.add(*Src1); 3396 3397 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3398 3399 if (Src2) { 3400 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 3401 if (Op32Src2Idx != -1) { 3402 Inst32.add(*Src2); 3403 } else { 3404 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 3405 // replaced with an implicit read of vcc. This was already added 3406 // during the initial BuildMI, so find it to preserve the flags. 3407 copyFlagsToImplicitVCC(*Inst32, *Src2); 3408 } 3409 } 3410 3411 return Inst32; 3412 } 3413 3414 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 3415 const MachineOperand &MO, 3416 const MCOperandInfo &OpInfo) const { 3417 // Literal constants use the constant bus. 
3418 //if (isLiteralConstantLike(MO, OpInfo)) 3419 // return true; 3420 if (MO.isImm()) 3421 return !isInlineConstant(MO, OpInfo); 3422 3423 if (!MO.isReg()) 3424 return true; // Misc other operands like FrameIndex 3425 3426 if (!MO.isUse()) 3427 return false; 3428 3429 if (Register::isVirtualRegister(MO.getReg())) 3430 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 3431 3432 // Null is free 3433 if (MO.getReg() == AMDGPU::SGPR_NULL) 3434 return false; 3435 3436 // SGPRs use the constant bus 3437 if (MO.isImplicit()) { 3438 return MO.getReg() == AMDGPU::M0 || 3439 MO.getReg() == AMDGPU::VCC || 3440 MO.getReg() == AMDGPU::VCC_LO; 3441 } else { 3442 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 3443 AMDGPU::SReg_64RegClass.contains(MO.getReg()); 3444 } 3445 } 3446 3447 static Register findImplicitSGPRRead(const MachineInstr &MI) { 3448 for (const MachineOperand &MO : MI.implicit_operands()) { 3449 // We only care about reads. 3450 if (MO.isDef()) 3451 continue; 3452 3453 switch (MO.getReg()) { 3454 case AMDGPU::VCC: 3455 case AMDGPU::VCC_LO: 3456 case AMDGPU::VCC_HI: 3457 case AMDGPU::M0: 3458 case AMDGPU::FLAT_SCR: 3459 return MO.getReg(); 3460 3461 default: 3462 break; 3463 } 3464 } 3465 3466 return AMDGPU::NoRegister; 3467 } 3468 3469 static bool shouldReadExec(const MachineInstr &MI) { 3470 if (SIInstrInfo::isVALU(MI)) { 3471 switch (MI.getOpcode()) { 3472 case AMDGPU::V_READLANE_B32: 3473 case AMDGPU::V_READLANE_B32_gfx6_gfx7: 3474 case AMDGPU::V_READLANE_B32_gfx10: 3475 case AMDGPU::V_READLANE_B32_vi: 3476 case AMDGPU::V_WRITELANE_B32: 3477 case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: 3478 case AMDGPU::V_WRITELANE_B32_gfx10: 3479 case AMDGPU::V_WRITELANE_B32_vi: 3480 return false; 3481 } 3482 3483 return true; 3484 } 3485 3486 if (MI.isPreISelOpcode() || 3487 SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 3488 SIInstrInfo::isSALU(MI) || 3489 SIInstrInfo::isSMRD(MI)) 3490 return false; 3491 3492 return true; 3493 } 3494 3495 static bool 
isSubRegOf(const SIRegisterInfo &TRI, 3496 const MachineOperand &SuperVec, 3497 const MachineOperand &SubReg) { 3498 if (Register::isPhysicalRegister(SubReg.getReg())) 3499 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 3500 3501 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 3502 SubReg.getReg() == SuperVec.getReg(); 3503 } 3504 3505 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 3506 StringRef &ErrInfo) const { 3507 uint16_t Opcode = MI.getOpcode(); 3508 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 3509 return true; 3510 3511 const MachineFunction *MF = MI.getParent()->getParent(); 3512 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3513 3514 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 3515 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 3516 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 3517 3518 // Make sure the number of operands is correct. 3519 const MCInstrDesc &Desc = get(Opcode); 3520 if (!Desc.isVariadic() && 3521 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 3522 ErrInfo = "Instruction has wrong number of operands."; 3523 return false; 3524 } 3525 3526 if (MI.isInlineAsm()) { 3527 // Verify register classes for inlineasm constraints. 
3528 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 3529 I != E; ++I) { 3530 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 3531 if (!RC) 3532 continue; 3533 3534 const MachineOperand &Op = MI.getOperand(I); 3535 if (!Op.isReg()) 3536 continue; 3537 3538 Register Reg = Op.getReg(); 3539 if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) { 3540 ErrInfo = "inlineasm operand has incorrect register class."; 3541 return false; 3542 } 3543 } 3544 3545 return true; 3546 } 3547 3548 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 3549 ErrInfo = "missing memory operand from MIMG instruction."; 3550 return false; 3551 } 3552 3553 // Make sure the register classes are correct. 3554 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 3555 if (MI.getOperand(i).isFPImm()) { 3556 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 3557 "all fp values to integers."; 3558 return false; 3559 } 3560 3561 int RegClass = Desc.OpInfo[i].RegClass; 3562 3563 switch (Desc.OpInfo[i].OperandType) { 3564 case MCOI::OPERAND_REGISTER: 3565 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 3566 ErrInfo = "Illegal immediate value for operand."; 3567 return false; 3568 } 3569 break; 3570 case AMDGPU::OPERAND_REG_IMM_INT32: 3571 case AMDGPU::OPERAND_REG_IMM_FP32: 3572 break; 3573 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3574 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3575 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3576 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3577 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3578 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3579 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3580 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 3581 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3582 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3583 const MachineOperand &MO = MI.getOperand(i); 3584 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 3585 ErrInfo = "Illegal immediate 
value for operand."; 3586 return false; 3587 } 3588 break; 3589 } 3590 case MCOI::OPERAND_IMMEDIATE: 3591 case AMDGPU::OPERAND_KIMM32: 3592 // Check if this operand is an immediate. 3593 // FrameIndex operands will be replaced by immediates, so they are 3594 // allowed. 3595 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 3596 ErrInfo = "Expected immediate, but got non-immediate"; 3597 return false; 3598 } 3599 LLVM_FALLTHROUGH; 3600 default: 3601 continue; 3602 } 3603 3604 if (!MI.getOperand(i).isReg()) 3605 continue; 3606 3607 if (RegClass != -1) { 3608 Register Reg = MI.getOperand(i).getReg(); 3609 if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg)) 3610 continue; 3611 3612 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 3613 if (!RC->contains(Reg)) { 3614 ErrInfo = "Operand has incorrect register class."; 3615 return false; 3616 } 3617 } 3618 } 3619 3620 // Verify SDWA 3621 if (isSDWA(MI)) { 3622 if (!ST.hasSDWA()) { 3623 ErrInfo = "SDWA is not supported on this target"; 3624 return false; 3625 } 3626 3627 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 3628 3629 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 3630 3631 for (int OpIdx: OpIndicies) { 3632 if (OpIdx == -1) 3633 continue; 3634 const MachineOperand &MO = MI.getOperand(OpIdx); 3635 3636 if (!ST.hasSDWAScalar()) { 3637 // Only VGPRS on VI 3638 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 3639 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 3640 return false; 3641 } 3642 } else { 3643 // No immediates on GFX9 3644 if (!MO.isReg()) { 3645 ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; 3646 return false; 3647 } 3648 } 3649 } 3650 3651 if (!ST.hasSDWAOmod()) { 3652 // No omod allowed on VI 3653 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3654 if (OMod != nullptr && 3655 (!OMod->isImm() || OMod->getImm() != 0)) { 3656 ErrInfo 
= "OMod not allowed in SDWA instructions on VI"; 3657 return false; 3658 } 3659 } 3660 3661 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 3662 if (isVOPC(BasicOpcode)) { 3663 if (!ST.hasSDWASdst() && DstIdx != -1) { 3664 // Only vcc allowed as dst on VI for VOPC 3665 const MachineOperand &Dst = MI.getOperand(DstIdx); 3666 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 3667 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 3668 return false; 3669 } 3670 } else if (!ST.hasSDWAOutModsVOPC()) { 3671 // No clamp allowed on GFX9 for VOPC 3672 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3673 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 3674 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 3675 return false; 3676 } 3677 3678 // No omod allowed on GFX9 for VOPC 3679 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3680 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 3681 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 3682 return false; 3683 } 3684 } 3685 } 3686 3687 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 3688 if (DstUnused && DstUnused->isImm() && 3689 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 3690 const MachineOperand &Dst = MI.getOperand(DstIdx); 3691 if (!Dst.isReg() || !Dst.isTied()) { 3692 ErrInfo = "Dst register should have tied register"; 3693 return false; 3694 } 3695 3696 const MachineOperand &TiedMO = 3697 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 3698 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 3699 ErrInfo = 3700 "Dst register should be tied to implicit use of preserved register"; 3701 return false; 3702 } else if (Register::isPhysicalRegister(TiedMO.getReg()) && 3703 Dst.getReg() != TiedMO.getReg()) { 3704 ErrInfo = "Dst register should use same physical register as preserved"; 3705 return false; 3706 } 3707 } 3708 } 3709 3710 // Verify MIMG 3711 
if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 3712 // Ensure that the return type used is large enough for all the options 3713 // being used TFE/LWE require an extra result register. 3714 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 3715 if (DMask) { 3716 uint64_t DMaskImm = DMask->getImm(); 3717 uint32_t RegCount = 3718 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 3719 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 3720 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 3721 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 3722 3723 // Adjust for packed 16 bit values 3724 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 3725 RegCount >>= 1; 3726 3727 // Adjust if using LWE or TFE 3728 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 3729 RegCount += 1; 3730 3731 const uint32_t DstIdx = 3732 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 3733 const MachineOperand &Dst = MI.getOperand(DstIdx); 3734 if (Dst.isReg()) { 3735 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 3736 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 3737 if (RegCount > DstSize) { 3738 ErrInfo = "MIMG instruction returns too many registers for dst " 3739 "register class"; 3740 return false; 3741 } 3742 } 3743 } 3744 } 3745 3746 // Verify VOP*. Ignore multiple sgpr operands on writelane. 3747 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 3748 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 3749 // Only look at the true operands. Only a real operand can use the constant 3750 // bus, and we don't want to check pseudo-operands like the source modifier 3751 // flags. 
3752 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 3753 3754 unsigned ConstantBusCount = 0; 3755 unsigned LiteralCount = 0; 3756 3757 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 3758 ++ConstantBusCount; 3759 3760 SmallVector<Register, 2> SGPRsUsed; 3761 Register SGPRUsed = findImplicitSGPRRead(MI); 3762 if (SGPRUsed != AMDGPU::NoRegister) { 3763 ++ConstantBusCount; 3764 SGPRsUsed.push_back(SGPRUsed); 3765 } 3766 3767 for (int OpIdx : OpIndices) { 3768 if (OpIdx == -1) 3769 break; 3770 const MachineOperand &MO = MI.getOperand(OpIdx); 3771 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3772 if (MO.isReg()) { 3773 SGPRUsed = MO.getReg(); 3774 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 3775 return !RI.regsOverlap(SGPRUsed, SGPR); 3776 })) { 3777 ++ConstantBusCount; 3778 SGPRsUsed.push_back(SGPRUsed); 3779 } 3780 } else { 3781 ++ConstantBusCount; 3782 ++LiteralCount; 3783 } 3784 } 3785 } 3786 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3787 // v_writelane_b32 is an exception from constant bus restriction: 3788 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 3789 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 3790 Opcode != AMDGPU::V_WRITELANE_B32) { 3791 ErrInfo = "VOP* instruction violates constant bus restriction"; 3792 return false; 3793 } 3794 3795 if (isVOP3(MI) && LiteralCount) { 3796 if (LiteralCount && !ST.hasVOP3Literal()) { 3797 ErrInfo = "VOP3 instruction uses literal"; 3798 return false; 3799 } 3800 if (LiteralCount > 1) { 3801 ErrInfo = "VOP3 instruction uses more than one literal"; 3802 return false; 3803 } 3804 } 3805 } 3806 3807 // Special case for writelane - this can break the multiple constant bus rule, 3808 // but still can't use more than one SGPR register 3809 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 3810 unsigned SGPRCount = 0; 3811 Register SGPRUsed = AMDGPU::NoRegister; 3812 3813 for (int OpIdx : {Src0Idx, Src1Idx, 
Src2Idx}) { 3814 if (OpIdx == -1) 3815 break; 3816 3817 const MachineOperand &MO = MI.getOperand(OpIdx); 3818 3819 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3820 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 3821 if (MO.getReg() != SGPRUsed) 3822 ++SGPRCount; 3823 SGPRUsed = MO.getReg(); 3824 } 3825 } 3826 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 3827 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 3828 return false; 3829 } 3830 } 3831 } 3832 3833 // Verify misc. restrictions on specific instructions. 3834 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 3835 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 3836 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3837 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3838 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 3839 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 3840 if (!compareMachineOp(Src0, Src1) && 3841 !compareMachineOp(Src0, Src2)) { 3842 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 3843 return false; 3844 } 3845 } 3846 } 3847 3848 if (isSOP2(MI) || isSOPC(MI)) { 3849 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3850 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3851 unsigned Immediates = 0; 3852 3853 if (!Src0.isReg() && 3854 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) 3855 Immediates++; 3856 if (!Src1.isReg() && 3857 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) 3858 Immediates++; 3859 3860 if (Immediates > 1) { 3861 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 3862 return false; 3863 } 3864 } 3865 3866 if (isSOPK(MI)) { 3867 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 3868 if (Desc.isBranch()) { 3869 if (!Op->isMBB()) { 3870 ErrInfo = "invalid branch target for SOPK instruction"; 3871 return false; 3872 } 3873 } else { 3874 uint64_t Imm = Op->getImm(); 3875 if (sopkIsZext(MI)) { 3876 if (!isUInt<16>(Imm)) { 3877 ErrInfo = "invalid 
immediate for SOPK instruction"; 3878 return false; 3879 } 3880 } else { 3881 if (!isInt<16>(Imm)) { 3882 ErrInfo = "invalid immediate for SOPK instruction"; 3883 return false; 3884 } 3885 } 3886 } 3887 } 3888 3889 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 3890 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 3891 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3892 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 3893 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3894 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 3895 3896 const unsigned StaticNumOps = Desc.getNumOperands() + 3897 Desc.getNumImplicitUses(); 3898 const unsigned NumImplicitOps = IsDst ? 2 : 1; 3899 3900 // Allow additional implicit operands. This allows a fixup done by the post 3901 // RA scheduler where the main implicit operand is killed and implicit-defs 3902 // are added for sub-registers that remain live after this instruction. 3903 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 3904 ErrInfo = "missing implicit register operands"; 3905 return false; 3906 } 3907 3908 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3909 if (IsDst) { 3910 if (!Dst->isUse()) { 3911 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 3912 return false; 3913 } 3914 3915 unsigned UseOpIdx; 3916 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 3917 UseOpIdx != StaticNumOps + 1) { 3918 ErrInfo = "movrel implicit operands should be tied"; 3919 return false; 3920 } 3921 } 3922 3923 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3924 const MachineOperand &ImpUse 3925 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 3926 if (!ImpUse.isReg() || !ImpUse.isUse() || 3927 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 3928 ErrInfo = "src0 should be subreg of implicit vector use"; 3929 return false; 3930 } 3931 } 3932 3933 // Make sure we aren't losing exec uses in the td files. 
This mostly requires 3934 // being careful when using let Uses to try to add other use registers. 3935 if (shouldReadExec(MI)) { 3936 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 3937 ErrInfo = "VALU instruction does not implicitly read exec mask"; 3938 return false; 3939 } 3940 } 3941 3942 if (isSMRD(MI)) { 3943 if (MI.mayStore()) { 3944 // The register offset form of scalar stores may only use m0 as the 3945 // soffset register. 3946 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 3947 if (Soff && Soff->getReg() != AMDGPU::M0) { 3948 ErrInfo = "scalar stores must use m0 as offset register"; 3949 return false; 3950 } 3951 } 3952 } 3953 3954 if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { 3955 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3956 if (Offset->getImm() != 0) { 3957 ErrInfo = "subtarget does not support offsets in flat instructions"; 3958 return false; 3959 } 3960 } 3961 3962 if (isMIMG(MI)) { 3963 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 3964 if (DimOp) { 3965 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 3966 AMDGPU::OpName::vaddr0); 3967 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 3968 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 3969 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3970 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 3971 const AMDGPU::MIMGDimInfo *Dim = 3972 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 3973 3974 if (!Dim) { 3975 ErrInfo = "dim is out of range"; 3976 return false; 3977 } 3978 3979 bool IsA16 = false; 3980 if (ST.hasR128A16()) { 3981 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 3982 IsA16 = R128A16->getImm() != 0; 3983 } else if (ST.hasGFX10A16()) { 3984 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 3985 IsA16 = A16->getImm() != 0; 3986 } 3987 3988 bool PackDerivatives = IsA16 || 
BaseOpcode->G16; 3989 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 3990 3991 unsigned AddrWords = BaseOpcode->NumExtraArgs; 3992 unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + 3993 (BaseOpcode->LodOrClampOrMip ? 1 : 0); 3994 if (IsA16) 3995 AddrWords += (AddrComponents + 1) / 2; 3996 else 3997 AddrWords += AddrComponents; 3998 3999 if (BaseOpcode->Gradients) { 4000 if (PackDerivatives) 4001 // There are two gradients per coordinate, we pack them separately. 4002 // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) 4003 AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; 4004 else 4005 AddrWords += Dim->NumGradients; 4006 } 4007 4008 unsigned VAddrWords; 4009 if (IsNSA) { 4010 VAddrWords = SRsrcIdx - VAddr0Idx; 4011 } else { 4012 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); 4013 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; 4014 if (AddrWords > 8) 4015 AddrWords = 16; 4016 else if (AddrWords > 4) 4017 AddrWords = 8; 4018 else if (AddrWords == 4) 4019 AddrWords = 4; 4020 else if (AddrWords == 3) 4021 AddrWords = 3; 4022 } 4023 4024 if (VAddrWords != AddrWords) { 4025 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords 4026 << " but got " << VAddrWords << "\n"); 4027 ErrInfo = "bad vaddr size"; 4028 return false; 4029 } 4030 } 4031 } 4032 4033 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 4034 if (DppCt) { 4035 using namespace AMDGPU::DPP; 4036 4037 unsigned DC = DppCt->getImm(); 4038 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 4039 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 4040 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 4041 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 4042 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 4043 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 4044 (DC >= 
DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
      ErrInfo = "Invalid dpp_ctrl value";
      return false;
    }
    // Wavefront-wide shift controls were removed on GFX10.
    if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "wavefront shifts are not supported on GFX10+";
      return false;
    }
    // Row-broadcast controls were also removed on GFX10.
    if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "broadcasts are not supported on GFX10+";
      return false;
    }
    // Conversely, row_share/row_xmask only exist starting with GFX10.
    if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
        ST.getGeneration() < AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "row_share and row_xmask are not supported before GFX10";
      return false;
    }
  }

  return true;
}

/// Map a scalar (SALU) opcode to the VALU opcode used when the instruction is
/// moved to the vector unit. Returns AMDGPU::INSTRUCTION_LIST_END when the
/// opcode has no single direct VALU equivalent; reaching the end of the
/// switch with a scalar compare/branch-style opcode is a programming error.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  // Target-independent pseudos are kept as-is.
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
  case AMDGPU::WWM: return AMDGPU::WWM;
  case AMDGPU::S_MOV_B32: {
    // A register source (or an AGPR destination) becomes a generic COPY;
    // only an immediate move becomes v_mov_b32.
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    // v_xnor only exists on subtargets with DL insts; otherwise no direct map.
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  // Scalar sign-extensions are lowered to a vector bitfield extract.
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // 64-bit NOT maps to the 32-bit VALU op; presumably the caller splits the
  // 64-bit operation into two 32-bit halves -- confirm against moveToVALU.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  // SCC-based branches become VCC-based branches.
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}

/// Return the register class of operand \p OpNo. When the instruction's
/// descriptor does not pin down a class (variadic instruction, operand index
/// beyond the described operands, or RegClass == -1), fall back to the class
/// of the register currently occupying the operand.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    Register Reg = MI.getOperand(OpNo).getReg();

    if (Register::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

/// Legalize the operand at \p OpIdx by copying/moving its current value into
/// a freshly created virtual register and rewriting the operand to use it.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock::iterator I = MI; 4170 MachineBasicBlock *MBB = MI.getParent(); 4171 MachineOperand &MO = MI.getOperand(OpIdx); 4172 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 4173 const SIRegisterInfo *TRI = 4174 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 4175 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 4176 const TargetRegisterClass *RC = RI.getRegClass(RCID); 4177 unsigned Size = TRI->getRegSizeInBits(*RC); 4178 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; 4179 if (MO.isReg()) 4180 Opcode = AMDGPU::COPY; 4181 else if (RI.isSGPRClass(RC)) 4182 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 4183 4184 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 4185 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 4186 VRC = &AMDGPU::VReg_64RegClass; 4187 else 4188 VRC = &AMDGPU::VGPR_32RegClass; 4189 4190 Register Reg = MRI.createVirtualRegister(VRC); 4191 DebugLoc DL = MBB->findDebugLoc(I); 4192 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 4193 MO.ChangeToRegister(Reg, false); 4194 } 4195 4196 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 4197 MachineRegisterInfo &MRI, 4198 MachineOperand &SuperReg, 4199 const TargetRegisterClass *SuperRC, 4200 unsigned SubIdx, 4201 const TargetRegisterClass *SubRC) 4202 const { 4203 MachineBasicBlock *MBB = MI->getParent(); 4204 DebugLoc DL = MI->getDebugLoc(); 4205 Register SubReg = MRI.createVirtualRegister(SubRC); 4206 4207 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 4208 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4209 .addReg(SuperReg.getReg(), 0, SubIdx); 4210 return SubReg; 4211 } 4212 4213 // Just in case the super register is itself a sub-register, copy it to a new 4214 // value so we don't need to worry about merging its subreg index with the 4215 // SubIdx passed to this function. 
The register coalescer should be able to 4216 // eliminate this extra copy. 4217 Register NewSuperReg = MRI.createVirtualRegister(SuperRC); 4218 4219 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 4220 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 4221 4222 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4223 .addReg(NewSuperReg, 0, SubIdx); 4224 4225 return SubReg; 4226 } 4227 4228 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 4229 MachineBasicBlock::iterator MII, 4230 MachineRegisterInfo &MRI, 4231 MachineOperand &Op, 4232 const TargetRegisterClass *SuperRC, 4233 unsigned SubIdx, 4234 const TargetRegisterClass *SubRC) const { 4235 if (Op.isImm()) { 4236 if (SubIdx == AMDGPU::sub0) 4237 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 4238 if (SubIdx == AMDGPU::sub1) 4239 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 4240 4241 llvm_unreachable("Unhandled register index for immediate"); 4242 } 4243 4244 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 4245 SubIdx, SubRC); 4246 return MachineOperand::CreateReg(SubReg, false); 4247 } 4248 4249 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 4250 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 4251 assert(Inst.getNumExplicitOperands() == 3); 4252 MachineOperand Op1 = Inst.getOperand(1); 4253 Inst.RemoveOperand(1); 4254 Inst.addOperand(Op1); 4255 } 4256 4257 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 4258 const MCOperandInfo &OpInfo, 4259 const MachineOperand &MO) const { 4260 if (!MO.isReg()) 4261 return false; 4262 4263 Register Reg = MO.getReg(); 4264 const TargetRegisterClass *RC = Register::isVirtualRegister(Reg) 4265 ? 
MRI.getRegClass(Reg)
                                      : RI.getPhysRegClass(Reg);

  const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
  if (MO.getSubReg()) {
    // For a sub-register operand, legality is checked against the super
    // class that contains the required class at that sub-register index.
    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
    if (!SuperRC)
      return false;

    DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
    if (!DRC)
      return false;
  }
  return RC->hasSuperClassEq(DRC);
}

/// Check whether \p MO is usable as a VSrc operand: any legal register, or
/// any of the non-register operand kinds that are encoded like immediates.
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
  return true;
}

/// Check whether \p MO (or, when null, the operand currently at \p OpIdx)
/// would be a legal operand for \p MI at index \p OpIdx. For VALU
/// instructions this also verifies that placing the operand there would not
/// exceed the subtarget's constant bus and VOP3 literal limits, counting all
/// other operands of the instruction.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
  // VOP3 may carry at most one literal, and only on subtargets that allow it.
  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
      return false;

    // Account for every other operand that also consumes constant bus slots;
    // identical SGPR (reg, subreg) pairs are only counted once.
    SmallDenseSet<RegSubRegPair> SGPRsUsed;
    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
        if (!SGPRsUsed.count(SGPR) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          if (--ConstantBusLimit <= 0)
            return false;
          SGPRsUsed.insert(SGPR);
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        // An inlined 32-bit constant operand also occupies a bus slot.
        if (--ConstantBusLimit <= 0)
          return false;
      } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
                 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
        // A VOP3 literal consumes both a literal slot and a bus slot.
        if (!VOP3LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}

/// Legalize the operands of a VOP2 instruction in place: move illegal src0,
/// fix up v_writelane/v_readlane lane-select constraints with
/// v_readfirstlane, and otherwise try to commute before falling back to
/// moving src1 into a register.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use before GFX10.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
      Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
       isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
    legalizeOpWithMove(MI, Src0Idx);

  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
  // src0/src1 with V_READFIRSTLANE.
  if (Opc == AMDGPU::V_WRITELANE_B32) {
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src0);
      Src0.ChangeToRegister(Reg, false);
    }
    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    return;
  }

  // No VOP2 instructions support AGPRs.
  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
    legalizeOpWithMove(MI, Src0Idx);

  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
    legalizeOpWithMove(MI, Src1Idx);

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Manually swap src0 and src1 (including subreg and kill state), since
  // MachineOperand has no generic "exchange" helper.
  Register Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
  fixImplicitOperands(MI);
}

// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32 ||
      Opc == AMDGPU::V_PERMLANEX16_B32) {
    // src1 and src2 must be scalar
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src2);
      Src2.ChangeToRegister(Reg, false);
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg != AMDGPU::NoRegister) {
    // Reserve a bus slot for the SGPR that is going to be kept.
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
      // Non-register operands only matter if they would be encoded as a
      // literal; inline constants are always legal.
      if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
        continue;

      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      // Out of literal and/or bus budget: move the constant into a register.
      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
        !isOperandLegal(MI, Idx, &MO)) {
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue;
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
4551 legalizeOpWithMove(MI, Idx); 4552 } 4553 } 4554 4555 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 4556 MachineRegisterInfo &MRI) const { 4557 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 4558 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 4559 Register DstReg = MRI.createVirtualRegister(SRC); 4560 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 4561 4562 if (RI.hasAGPRs(VRC)) { 4563 VRC = RI.getEquivalentVGPRClass(VRC); 4564 Register NewSrcReg = MRI.createVirtualRegister(VRC); 4565 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4566 get(TargetOpcode::COPY), NewSrcReg) 4567 .addReg(SrcReg); 4568 SrcReg = NewSrcReg; 4569 } 4570 4571 if (SubRegs == 1) { 4572 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4573 get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 4574 .addReg(SrcReg); 4575 return DstReg; 4576 } 4577 4578 SmallVector<unsigned, 8> SRegs; 4579 for (unsigned i = 0; i < SubRegs; ++i) { 4580 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4581 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4582 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 4583 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 4584 SRegs.push_back(SGPR); 4585 } 4586 4587 MachineInstrBuilder MIB = 4588 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4589 get(AMDGPU::REG_SEQUENCE), DstReg); 4590 for (unsigned i = 0; i < SubRegs; ++i) { 4591 MIB.addReg(SRegs[i]); 4592 MIB.addImm(RI.getSubRegFromChannel(i)); 4593 } 4594 return DstReg; 4595 } 4596 4597 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 4598 MachineInstr &MI) const { 4599 4600 // If the pointer is store in VGPRs, then we need to move them to 4601 // SGPRs using v_readfirstlane. This is safe because we only select 4602 // loads with uniform pointers to SMRD instruction so we know the 4603 // pointer value is uniform. 
4604 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 4605 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 4606 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 4607 SBase->setReg(SGPR); 4608 } 4609 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); 4610 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 4611 unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 4612 SOff->setReg(SGPR); 4613 } 4614 } 4615 4616 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 4617 MachineBasicBlock::iterator I, 4618 const TargetRegisterClass *DstRC, 4619 MachineOperand &Op, 4620 MachineRegisterInfo &MRI, 4621 const DebugLoc &DL) const { 4622 Register OpReg = Op.getReg(); 4623 unsigned OpSubReg = Op.getSubReg(); 4624 4625 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 4626 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 4627 4628 // Check if operand is already the correct register class. 4629 if (DstRC == OpRC) 4630 return; 4631 4632 Register DstReg = MRI.createVirtualRegister(DstRC); 4633 MachineInstr *Copy = 4634 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 4635 4636 Op.setReg(DstReg); 4637 Op.setSubReg(0); 4638 4639 MachineInstr *Def = MRI.getVRegDef(OpReg); 4640 if (!Def) 4641 return; 4642 4643 // Try to eliminate the copy if it is copying an immediate value. 
if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
    FoldImmediate(*Copy, *Def, OpReg, &MRI);

  // Walk back through a chain of virtual-register copies to find out whether
  // the copied value ultimately comes from an IMPLICIT_DEF.
  bool ImpDef = Def->isImplicitDef();
  while (!ImpDef && Def && Def->isCopy()) {
    if (Def->getOperand(1).getReg().isPhysical())
      break;
    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
    ImpDef = Def && Def->isImplicitDef();
  }
  // For copies to a vector class that don't already read EXEC and aren't
  // fed by an IMPLICIT_DEF, record an implicit EXEC use on the copy.
  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
      !ImpDef)
    Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
}

// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p Rsrc across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                          MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                          const DebugLoc &DL, MachineOperand &Rsrc) {
  MachineFunction &MF = *OrigBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Pick the wave-size-appropriate EXEC register and scalar opcodes.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned SaveExecOpc =
      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  unsigned XorTermOpc =
      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  unsigned AndOpc =
      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  MachineBasicBlock::iterator I = LoopBB.begin();

  Register VRsrc = Rsrc.getReg();
  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  // Beginning of the loop, read the next Rsrc variant.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);

  // Reassemble the four scalar dwords into a 128-bit SGPR rsrc.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
      .addReg(SRsrcSub0)
      .addImm(AMDGPU::sub0)
      .addReg(SRsrcSub1)
      .addImm(AMDGPU::sub1)
      .addReg(SRsrcSub2)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcSub3)
      .addImm(AMDGPU::sub3);

  // Update Rsrc operand to use the SGPR Rsrc.
  Rsrc.setReg(SRsrc);
  Rsrc.setIsKill(true);

  // Identify all lanes with identical Rsrc operands in their VGPRs.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
      .addReg(CondReg0)
      .addReg(CondReg1);

  MRI.setSimpleHint(SaveExec, AndCond);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
      .addReg(AndCond, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = LoopBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
      .addReg(Exec)
      .addReg(SaveExec);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}

// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
// with SGPRs by iterating over all unique values across all lanes.
static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                              MachineOperand &Rsrc, MachineDominatorTree *MDT) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator I(&MI);
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ?
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);

  // Killed uses in the instruction we are waterfalling around will be
  // incorrect due to the added control-flow.
  for (auto &MO : MI.uses()) {
    if (MO.isReg() && MO.isUse()) {
      MRI.clearKillFlags(MO.getReg());
    }
  }

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, RemainderBB);

  // LoopBB both falls through to RemainderBB and branches back to itself.
  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
  MachineBasicBlock::iterator J = I++;
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  LoopBB->splice(LoopBB->begin(), &MBB, J);

  MBB.addSuccessor(LoopBB);

  // Update dominators. We know that MBB immediately dominates LoopBB, that
  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
  // dominates all of the successors transferred to it from MBB that MBB used
  // to properly dominate.
  if (MDT) {
    MDT->addNewBlock(LoopBB, &MBB);
    MDT->addNewBlock(RemainderBB, LoopBB);
    for (auto &Succ : RemainderBB->successors()) {
      if (MDT->properlyDominates(&MBB, Succ)) {
        MDT->changeImmediateDominator(Succ, RemainderBB);
      }
    }
  }

  // Fill the new loop block with the readfirstlane / compare / exec-mask body.
  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);

  // Restore the EXEC mask
  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
}

// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
static std::tuple<unsigned, unsigned>
extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Extract the ptr from the resource descriptor.
  unsigned RsrcPtr =
      TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
                             AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

  // Create an empty resource descriptor
  Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();

  // Zero64 = 0
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
      .addImm(0);

  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
      .addImm(RsrcDataFormat & 0xFFFFFFFF);

  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
      .addImm(RsrcDataFormat >> 32);

  //
  // NewSRsrc = {Zero64, SRsrcFormat}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

  return std::make_tuple(RsrcPtr, NewSRsrc);
}

/// Make all of \p MI's operands legal for its opcode, inserting copies,
/// readfirstlanes, or a waterfall loop as required. \p MDT, if non-null, is
/// kept up to date when control flow is introduced.
void SIInstrInfo::legalizeOperands(MachineInstr &MI,
                                   MachineDominatorTree *MDT) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // PHI operands come in (value, predecessor-MBB) pairs, hence i += 2.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !Register::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVectorRegisters(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
          VRC = &AMDGPU::VReg_1RegClass;
        } else
          VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                    ? RI.getEquivalentAGPRClass(SRC)
                    : RI.getEquivalentVGPRClass(SRC);
      } else {
        VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                  ? RI.getEquivalentAGPRClass(VRC)
                  : RI.getEquivalentVGPRClass(VRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI.getOperand(I);
      if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
        continue;

      // MI is a PHI instruction.
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      // Avoid creating no-op copies with the same src and dst reg class. These
      // confuse some of the machine passes.
      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI.getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
          continue;

        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
    Register Dst = MI.getOperand(0).getReg();
    Register Src0 = MI.getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock *MBB = MI.getParent();
      MachineOperand &Op = MI.getOperand(1);
      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
    }
    return;
  }

  // Legalize SI_INIT_M0
  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
    MachineOperand &Src = MI.getOperand(0);
    // M0 can only be written from an SGPR; readfirstlane a vector source.
    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
    return;
  }

  // Legalize MIMG and MUBUF/MTBUF for shaders.
  //
  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
  // scratch memory access. In both cases, the legalization never involves
  // conversion to the addr64 form.
  if (isMIMG(MI) ||
      (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
       (isMUBUF(MI) || isMTBUF(MI)))) {
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions.
  int RsrcIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  if (RsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
    unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
                             RI.getRegClass(RsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    // Legalize a VGPR Rsrc.
    //
    // If the instruction is _ADDR64, we can avoid a waterfall by extracting
    // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
    // a zero-value SRsrc.
    //
    // If the instruction is _OFFSET (both idxen and offen disabled), and we
    // support ADDR64 instructions, we can convert to ADDR64 and do the same as
    // above.
    //
    // Otherwise we are on non-ADDR64 hardware, and/or we have
    // idxen/offen/bothen and we fall back to a waterfall loop.

    MachineBasicBlock &MBB = *MI.getParent();

    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

      const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
      Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
      Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
          .addDef(CondReg0)
          .addReg(RsrcPtr, 0, AMDGPU::sub0)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
          .addImm(0);

      // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 (with carry-in from the low
      // half; the carry-out is dead).
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
          .addDef(CondReg1, RegState::Dead)
          .addReg(RsrcPtr, 0, AMDGPU::sub1)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
          .addReg(CondReg0, RegState::Kill)
          .addImm(0);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
          .addReg(NewVAddrLo)
          .addImm(AMDGPU::sub0)
          .addReg(NewVAddrHi)
          .addImm(AMDGPU::sub1);

      VAddr->setReg(NewVAddr);
      Rsrc->setReg(NewSRsrc);
    } else if (!VAddr && ST.hasAddr64()) {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                .add(*VData)
                .addReg(NewVAddr)
                .addReg(NewSRsrc)
                .add(*SOffset)
                .add(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
                getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }
        if (const MachineOperand *DLC =
                getNamedOperand(MI, AMDGPU::OpName::dlc)) {
          MIB.addImm(DLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));

        MIB.cloneMemRefs(MI);
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                     .add(*VData)
                     .add(*VDataIn)
                     .addReg(NewVAddr)
                     .addReg(NewSRsrc)
                     .add(*SOffset)
                     .add(*Offset)
                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
                     .cloneMemRefs(MI);
      }

      MI.removeFromParent();

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
              NewVAddr)
          .addReg(RsrcPtr, 0, AMDGPU::sub0)
          .addImm(AMDGPU::sub0)
          .addReg(RsrcPtr, 0, AMDGPU::sub1)
          .addImm(AMDGPU::sub1);
    } else {
      // This is another variant; legalize Rsrc with waterfall loop from VGPRs
      // to SGPRs.
      loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
    }
  }
}

/// Rewrite \p TopInst (and, transitively, every instruction that consumes its
/// result) from SALU to VALU form, fixing up operands and register classes as
/// it goes. Uses a worklist so newly affected users are processed too.
void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
                             MachineDominatorTree *MDT) const {
  SetVectorType Worklist;
  Worklist.insert(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned NewOpcode = getVALUOp(Inst);

    // Handle some special cases. Cases that `continue` fully replace the
    // instruction themselves; cases that `break` fall through to the generic
    // opcode-replacement path below.
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO:
      splitScalar64BitAddSub(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32:
      // FIXME: The u32 versions currently selected use the carry.
      if (moveScalarAddSub(Worklist, Inst, MDT))
        continue;

      // Default handling
      break;
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B64:
      if (ST.hasDLInsts())
        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
      else
        splitScalar64BitXnor(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64:
      splitScalar64BitBFE(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    // Targets with only "reversed" VALU shifts take the shift amount as
    // src0, so swap operands when switching opcode.
    case AMDGPU::S_LSHL_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      if (ST.isWave32())
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
                AMDGPU::VCC_LO)
            .addReg(AMDGPU::EXEC_LO)
            .addReg(AMDGPU::VCC_LO);
      else
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
                AMDGPU::VCC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");

    case AMDGPU::S_PACK_LL_B32_B16:
    case AMDGPU::S_PACK_LH_B32_B16:
    case AMDGPU::S_PACK_HH_B32_B16:
      movePackToVALU(Worklist, MRI, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B32:
      lowerScalarXnor(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;

    // TODO: remove as soon as everything is ready
    // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
    // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
    // can only be selected from the uniform SDNode.
    case AMDGPU::S_ADD_CO_PSEUDO:
    case AMDGPU::S_SUB_CO_PSEUDO: {
      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                         ? AMDGPU::V_ADDC_U32_e64
                         : AMDGPU::V_SUBB_U32_e64;
      const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

      // Ensure the carry-in register has the class the VALU opcode expects.
      Register CarryInReg = Inst.getOperand(4).getReg();
      if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
        Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
            .addReg(CarryInReg);
      }

      Register CarryOutReg = Inst.getOperand(1).getReg();

      Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
          MRI.getRegClass(Inst.getOperand(0).getReg())));
      MachineInstr *CarryOp =
          BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
              .addReg(CarryOutReg, RegState::Define)
              .add(Inst.getOperand(2))
              .add(Inst.getOperand(3))
              .addReg(CarryInReg)
              .addImm(0);
      legalizeOperands(*CarryOp);
      MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
      addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
      Inst.eraseFromParent();
    }
      continue;
    case AMDGPU::S_UADDO_PSEUDO:
    case AMDGPU::S_USUBO_PSEUDO: {
      const DebugLoc &DL = Inst.getDebugLoc();
      MachineOperand &Dest0 = Inst.getOperand(0);
      MachineOperand &Dest1 = Inst.getOperand(1);
      MachineOperand &Src0 = Inst.getOperand(2);
      MachineOperand &Src1 = Inst.getOperand(3);

      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                         ? AMDGPU::V_ADD_I32_e64
                         : AMDGPU::V_SUB_I32_e64;
      const TargetRegisterClass *NewRC =
          RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
      Register DestReg = MRI.createVirtualRegister(NewRC);
      MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
                                   .addReg(Dest1.getReg(), RegState::Define)
                                   .add(Src0)
                                   .add(Src1)
                                   .addImm(0); // clamp bit

      legalizeOperands(*NewInstr, MDT);

      MRI.replaceRegWith(Dest0.getReg(), DestReg);
      addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
                                   Worklist);
      Inst.eraseFromParent();
    }
      continue;
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      legalizeOperands(Inst, MDT);
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst.setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst.getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        // Only propagate through live-def of SCC.
        if (Op.isDef() && !Op.isDead())
          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
        Inst.RemoveOperand(i);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst.addOperand(MachineOperand::CreateImm(0));
      Inst.addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst.addOperand(MachineOperand::CreateImm(0));
    }

    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    fixImplicitOperands(Inst);

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst.RemoveOperand(2); // Remove old immediate.
      Inst.addOperand(MachineOperand::CreateImm(Offset));
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      Register DstReg = Inst.getOperand(0).getReg();
      if (Register::isPhysicalRegister(DstReg))
        continue;

      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
      if (!NewDstRC)
        continue;

      if (Inst.isCopy() &&
          Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge. Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
        MRI.clearKillFlags(Inst.getOperand(1).getReg());
        Inst.getOperand(0).setReg(DstReg);

        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
        // these are deleted later, but at -O0 it would leave a suspicious
        // looking illegal copy of an undef register.
        for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
          Inst.RemoveOperand(I);
        Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
        continue;
      }

      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    legalizeOperands(Inst, MDT);

    if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
}

// Add/sub require special handling to deal with carry outs.
bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                                   MachineDominatorTree *MDT) const {
  if (ST.hasAddNoCarry()) {
    // Assume there is no user of scc since we don't select this in that case.
    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    // is used.

    MachineBasicBlock &MBB = *Inst.getParent();
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    Register OldDstReg = Inst.getOperand(0).getReg();
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    unsigned Opc = Inst.getOpcode();
    assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);

    unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;

    // Drop the implicit SCC def; the no-carry VALU form does not produce it.
    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    Inst.RemoveOperand(3);

    Inst.setDesc(get(NewOpc));
    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
    Inst.addImplicitDefUseOperands(*MBB.getParent());
    MRI.replaceRegWith(OldDstReg, ResultReg);
    legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return true;
  }

  return false;
}

/// Lower S_ABS_I32 to VALU form as max(x, 0 - x), queueing users of the
/// result for further VALU conversion.
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                 MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  unsigned SubOp = ST.hasAddNoCarry() ?
      AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;

  // TmpReg = 0 - Src
  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  // ResultReg = max(Src, -Src) == |Src|
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Lower S_XNOR_B32 for conversion to the VALU: directly to V_XNOR_B32 when
/// the target has it, otherwise as NOT + XOR split across scalar/vector units.
void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
      .add(Src0)
      .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      // Invert the SGPR source on the scalar unit, then XOR.
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
      .addReg(Temp)
      .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
      .add(Src0)
      .addReg(Temp);
    } else {
      // Neither source is scalar: XOR first, then invert the result; the NOT
      // also goes on the worklist since it must be moved to the VALU.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
        .add(Src0)
        .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}

/// Lower an inverted 32-bit scalar binary op (e.g. NOR/NAND-style pseudos):
/// NewDest = not (Src0 <Opcode> Src1). Both scalar pieces are queued on the
/// worklist so a later pass can move them to the VALU if needed.
void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
                                      MachineInstr &Inst,
                                      unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // Interm = Src0 <Opcode> Src1
  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
                          .add(Src0)
                          .add(Src1);

  // NewDest = not Interm
  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
                           .addReg(Interm);

  Worklist.insert(&Op);
  Worklist.insert(&Not);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

/// Lower a 32-bit scalar op whose second operand is inverted (an "N2" form):
/// NewDest = Src0 <Opcode> (not Src1). Both scalar pieces are queued for
/// later VALU lowering.
void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
                                     MachineInstr &Inst,
                                     unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  // Interm = not Src1
  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
                           .add(Src1);

  // NewDest = Src0 <Opcode> Interm
  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
                          .add(Src0)
                          .addReg(Interm);

  Worklist.insert(&Not);
  Worklist.insert(&Op);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

/// Split a 64-bit scalar unary op into two 32-bit VALU ops applied to the
/// sub0/sub1 halves independently, then recombine with REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; assume a 32-bit SGPR layout.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  // Low 32-bit half of the source (register subreg or immediate piece).
  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  // Recombine the two halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Split a 64-bit scalar add/sub pseudo (S_ADD_U64_PSEUDO / the subtract
/// form) into a 32-bit add/sub producing a carry plus a 32-bit add/sub with
/// carry-in, then recombine the halves with REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
                                         MachineInstr &Inst,
                                         MachineDominatorTree *MDT) const {
  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Carry produced by the low half; the high half's carry-out is dead.
  Register CarryReg = MRI.createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned LoOpc = IsAdd ?
                   AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
  // Low half: produces the carry consumed by the high half.
  MachineInstr *LoHalf =
    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    .addReg(CarryReg, RegState::Define)
    .add(SrcReg0Sub0)
    .add(SrcReg1Sub0)
    .addImm(0); // clamp bit

  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  // High half: consumes the carry; its own carry-out is dead.
  MachineInstr *HiHalf =
    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .add(SrcReg0Sub1)
    .add(SrcReg1Sub1)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Split a 64-bit scalar binary op into two independent 32-bit ops of the
/// given \p Opcode on the sub0/sub1 halves of both sources, then recombine
/// the results with REG_SEQUENCE. Only valid for ops with no cross-half
/// dependency (e.g. bitwise logic).
void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // Immediate sources have no register class; assume 32-bit SGPR layout.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Lower a 64-bit scalar XNOR as NOT + XOR on the scalar unit, preferring to
/// apply the NOT to an SGPR source so the inversion can stay scalar. Only the
/// resulting XOR is queued for further VALU lowering.
void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
                                       MachineInstr &Inst,
                                       MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());

  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  MachineOperand* Op0;
  MachineOperand* Op1;

  // Invert the SGPR operand (Op0) if there is one; otherwise invert Src1.
  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
    Op0 = &Src0;
    Op1 = &Src1;
  } else {
    Op0 = &Src1;
    Op1 = &Src0;
  }

  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
    .add(*Op0);

  Register NewDest = MRI.createVirtualRegister(DestRC);

  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
    .addReg(Interm)
    .add(*Op1);

  MRI.replaceRegWith(Dest.getReg(), NewDest);

  Worklist.insert(&Xor);
}

/// Lower a 64-bit scalar population count by summing the popcounts of the
/// two 32-bit halves: V_BCNT accumulates its second operand, so the second
/// V_BCNT adds the high half's count onto the low half's result.
void SIInstrInfo::splitScalar64BitBCNT(
    SetVectorType &Worklist, MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = popcount(lo) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  // ResultReg = popcount(hi) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can
  // be an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Lower a 64-bit scalar sign-extending bitfield extract (S_BFE_I64 with
/// offset 0) to VALU instructions. The packed immediate encodes the offset in
/// bits [5:0] and the width in bits [22:16].
void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset; // Only used by the assert below.

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    // Narrow field: V_BFE the low dword, then broadcast its sign bit into
    // the high dword with an arithmetic shift right by 31.
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: low dword passes through unchanged; high dword is the
  // low dword's sign bit replicated.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Queue every user of \p DstReg whose constraining operand class is not a
/// vector register class for movement to the VALU. For copy-like opcodes the
/// result (operand 0) class is checked instead of the use operand's class.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
    Register DstReg,
    MachineRegisterInfo &MRI,
    SetVectorType &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();

    unsigned OpNo = 0;

    switch (UseMI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::WWM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
    case AMDGPU::INSERT_SUBREG:
      // For these, look at the destination's class (OpNo stays 0).
      break;
    default:
      OpNo = I.getOperandNo();
      break;
    }

    if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
      Worklist.insert(&UseMI);

      // Skip any remaining uses inside the same instruction; it is already
      // on the worklist.
      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

/// Lower an S_PACK_* pseudo (pack two 16-bit halves into a 32-bit value) to
/// an equivalent VALU sequence.
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    // result = (src1 << 16) | (src0 & 0xffff)
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // result = (src1 & 0xffff0000) | (src0 & 0xffff), via bitfield insert.
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // result = (src1 & 0xffff0000) | (src0 >> 16)
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Queue the users of the SCC def \p Op (in \p SCCDefInst) for VALU
/// lowering. Copies of SCC feeding S_ADD_CO/S_SUB_CO pseudos are redirected
/// to VCC and deleted; scanning stops at the next SCC def in the block.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SetVectorType &Worklist) const {
  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
      if (MI.isCopy()) {
        // A copy out of SCC: rewrite carry-in operands of S_ADD_CO/S_SUB_CO
        // users to VCC and remember the copy for deletion.
        MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
        unsigned DestReg = MI.getOperand(0).getReg();
        SmallVector<MachineInstr *, 4> Users;
        for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
          if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
              (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
            Users.push_back(&User);
            Worklist.insert(&User);
          }
        }
        for (auto &U : Users)
          U->getOperand(4).setReg(RI.getVCC());
        CopyToDelete.push_back(&MI);
      } else
        Worklist.insert(&MI);
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
      break;
  }
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();
}

/// Return the VGPR-equivalent register class to use for the destination when
/// \p Inst is moved to the VALU, or nullptr if no move is needed/possible.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
  case AMDGPU::WQM:
  case AMDGPU::SOFT_WQM:
  case AMDGPU::WWM: {
    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
    if (RI.hasAGPRs(SrcRC)) {
      // Source uses AGPRs: if the destination already does too, nothing to
      // change; otherwise pick AGPR for join-like opcodes and VGPR otherwise.
      if (RI.hasAGPRs(NewDstRC))
        return nullptr;

      switch (Inst.getOpcode()) {
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE:
      case AMDGPU::INSERT_SUBREG:
        NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
        break;
      default:
        NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      }

      if (!NewDstRC)
        return nullptr;
    } else {
      // Already vector (or the special VReg_1 class): no change needed.
      if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
        return nullptr;

      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      if (!NewDstRC)
        return nullptr;
    }

    return NewDstRC;
  }
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  Register SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  Register UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, Check the dynamic register class.
    Register Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.
  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// Return a pointer to \p MI's named operand, or nullptr if the opcode has
/// no operand with that name.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// Return the default buffer resource descriptor data-format bits for the
/// current subtarget generation.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60); // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// Return words 2-3 of the scratch buffer resource descriptor (size, stride,
/// and element-size fields), adjusted per subtarget generation.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32.
  uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

/// Scheduler query: only scalar memory reads are classified low-latency here.
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

/// Scheduler query: loads through the vector memory paths (buffer, image,
/// flat) are classified as high-latency defs.
bool SIInstrInfo::isHighLatencyDef(int Opc) const {
  return get(Opc).mayLoad() &&
         (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}

/// If \p MI addresses a frame index via its vaddr operand, set \p FrameIndex
/// and return the data register; otherwise return NoRegister.
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

/// SGPR-spill variant of isStackAccess: the frame index comes from the addr
/// operand and the value from the data operand.
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

/// TargetInstrInfo hook: recognize loads from a stack slot (MUBUF/VGPR and
/// SGPR spill forms) and report the frame index and loaded register.
unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// TargetInstrInfo hook: recognize stores to a stack slot; mirrors
/// isLoadFromStackSlot.
unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// Sum the encoded sizes of all instructions inside the bundle headed by
/// \p MI (the BUNDLE marker itself contributes nothing).
unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }

  return Size;
}

/// Return the encoded size of \p MI in bytes, accounting for trailing
/// 32-bit literal operands, MIMG NSA words, and meta-instructions.
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  if (isFixedSize(MI))
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return DescSize; // No operands.

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (Src2Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    return DescSize;
  }

  // Check whether we have extra NSA words.
  if (isMIMG(MI)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx < 0)
      return 8;

    int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
  }

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::EH_LABEL:
    return 0; // Meta-instructions emit no machine code.
  case TargetOpcode::BUNDLE:
    return getInstBundleSize(MI);
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(),
                              &MF->getSubtarget());
  }
  default:
    return DescSize;
  }
}

/// Return true if \p MI is a FLAT instruction that may touch the flat
/// address space. Conservatively true when there are no memory operands.
bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}

bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
}

/// Rewrite a non-uniform branch region into structured SI_IF / SI_END_CF
/// control flow: SI_IF replaces the branch in \p IfEntry and SI_END_CF is
/// inserted at the start of \p IfEnd.
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstr *SIIF =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
            .add(Branch->getOperand(0))
            .add(Branch->getOperand(1));
    MachineInstr *SIEND =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
            .addReg(DstReg);

    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}

/// Rewrite a non-uniform loop into structured SI_IF_BREAK / SI_LOOP control
/// flow. A PHI in the loop header merges the break mask from the back edge
/// with a zero mask materialized in every other predecessor.
void SIInstrInfo::convertNonUniformLoopRegion(
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstrBuilder HeaderPHIBuilder =
        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        // Back edge: carry the accumulated break mask.
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        // Loop entry edge: start with an all-zero mask.
        MachineBasicBlock *PMBB = *PI;
        Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                                  .addReg(DstReg)
                                  .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
            .addReg(BackEdgeReg)
            .addMBB(LoopEntry);

    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}

/// MIR serialization support: names for the target index operands.
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

/// Split a target flag word into its direct-flag part (masked by MO_MASK)
/// and the remaining bitmask-flag part.
std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
}

/// MIR serialization names for the direct machine-operand target flags.
ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" },
    { MO_ABS32_LO, "amdgpu-abs32-lo" },
    { MO_ABS32_HI, "amdgpu-abs32-hi" },
  };

  return makeArrayRef(TargetFlags);
}

/// A non-terminator, non-COPY instruction that writes exec is treated as part
/// of the basic-block prologue (e.g. exec-mask setup emitted at block entry).
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

/// Start building an add that does not clobber a carry-out usable by callers.
/// On subtargets with carry-less adds this is V_ADD_U32_e64; otherwise a
/// V_ADD_I32_e64 whose carry-out def is a fresh virtual register marked dead.
MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           Register DestReg) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
  // Hint the dead carry def toward vcc so it doesn't waste an SGPR pair.
  MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

/// Post-RA variant of getAddNoCarry: the dead carry register must be
/// scavenged rather than created. May return a null MachineInstrBuilder if
/// scavenging fails.
MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               const DebugLoc &DL,
                                               Register DestReg,
                                               RegScavenger &RS) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);

  // If available, prefer to use vcc.
  Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
                             ? Register(RI.getVCC())
                             : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);

  // TODO: Users need to deal with this.
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

/// True if \p Opcode is one of the SI_KILL_*_TERMINATOR pseudos.
bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

/// Map a SI_KILL_*_PSEUDO opcode to the descriptor of its terminator form.
const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}

/// On wave32 subtargets, rewrite any implicit VCC operand to VCC_LO.
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  // NOTE(review): this local ST shadows the SIInstrInfo::ST member;
  // presumably both refer to the same subtarget — confirm before relying on
  // the distinction.
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  if (!ST.isWave32())
    return;

  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}

/// True if \p MI is an SMRD instruction whose sbase operand is a 128-bit
/// SGPR tuple, i.e. a buffer resource descriptor rather than a plain address.
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}

/// Number of bits available for an immediate offset in FLAT instructions for
/// the given address space; 0 when such offsets are unsupported (no feature,
/// or the flat-segment-offset bug applies to FLAT_ADDRESS).
unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
                                           bool Signed) const {
  if (!ST.hasFlatInstOffsets())
    return 0;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return 0;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
    return Signed ? 12 : 11;

  return Signed ? 13 : 12;
}

/// Whether \p Offset fits in a FLAT instruction's immediate offset field.
/// Mirrors the bit widths reported by getNumFlatOffsetBits above.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    bool Signed) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return false;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (Signed && isInt<12>(Offset)) ||
           (!Signed && isUInt<11>(Offset));
  }

  return (Signed && isInt<13>(Offset)) ||
         (!Signed && isUInt<12>(Offset));
}


// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5,
  GFX10 = 6,
  SDWA10 = 7
};

/// Base encoding family for a subtarget generation (before SDWA/D16/renaming
/// adjustments applied in pseudoToMCOpcode).
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

/// MC opcodes that exist only for assembly/disassembly and must not be
/// produced by code generation.
bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch(MCOp) {
  // These opcodes use indirect register addressing so
  // they need special handling by codegen (currently missing).
  // Therefore it is too risky to allow these opcodes
  // to be selected by dpp combiner or sdwa peepholer.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}

/// Translate a pseudo opcode to the real MC opcode for this subtarget's
/// encoding family. Returns \p Opcode unchanged for native instructions and
/// -1 when no encoding exists (continued past this span).
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  SIEncodingFamily Gen = subtargetEncodingFamily(ST);

  // Instructions renamed in GFX9 need the dedicated GFX9 family even though
  // GFX9 otherwise shares the VI encoding.
  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
    ST.getGeneration() == AMDGPUSubtarget::GFX9)
    Gen = SIEncodingFamily::GFX9;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  // SDWA instructions use their own per-generation families.
  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      Gen = SIEncodingFamily::SDWA;
      break;
    case AMDGPUSubtarget::GFX9:
      Gen = SIEncodingFamily::SDWA9;
      break;
    case AMDGPUSubtarget::GFX10:
      Gen = SIEncodingFamily::SDWA10;
      break;
    }
  }

  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
  // no encoding in the given subtarget generation.
  if (MCOp == (uint16_t)-1)
    return -1;

  // Valid encoding, but deliberately kept out of codegen (asm-only).
  if (isAsmOnlyOpcode(MCOp))
    return -1;

  return MCOp;
}

/// Wrap an operand's reg:subreg as a RegSubRegPair, treating an undef
/// operand as the empty pair.
static
TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
  assert(RegOpnd.isReg());
  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
                             getRegSubRegPair(RegOpnd);
}

/// Find the source operand of a REG_SEQUENCE that feeds subregister index
/// \p SubReg. Operands come in (reg, subreg-imm) pairs after the def, hence
/// the stride-2 walk. Returns the empty pair if \p SubReg is not present.
TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}

// Try to find the definition of reg:subreg in subreg-manipulation pseudos
// Following a subreg of reg:subreg isn't supported
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}

/// Walk SSA def chains (through COPY, V_MOV_B32_e32, REG_SEQUENCE and
/// INSERT_SUBREG) to find the instruction that ultimately defines
/// \p P.Reg:P.SubReg. Returns nullptr if the chain hits a physical register,
/// an undef, or an unfollowable definition.
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!Register::isVirtualRegister(P.Reg))
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    // DefInst is re-set only when we can keep following; otherwise MI is the
    // final definition and is returned below.
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

/// Conservatively determine whether exec may be written between \p DefMI and
/// \p UseMI. Returns true (i.e. "maybe modified") whenever it cannot prove
/// otherwise: different blocks, scan limit exceeded, or an actual exec write.
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  // Bound the scan so this stays cheap on long blocks.
  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
  auto E = UseMI.getIterator();
  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }

  return false;
}

/// As execMayBeModifiedBeforeUse, but checked against every use of \p VReg.
/// First verifies that all (few) uses are in DefMI's block, then scans
/// forward counting them off.
bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Give up (conservatively) if there are too many uses to scan for.
  const int MaxUseInstScan = 10;
  int NumUseInst = 0;

  for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
    // Don't bother searching between blocks, although it is possible this block
    // doesn't modify exec.
    if (UseInst.getParent() != DefBB)
      return true;

    if (++NumUseInst > MaxUseInstScan)
      return true;
  }

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  // The loop has no block-end test: the use-counting loop above proved every
  // use of VReg is in this block, so we run out of uses (or hit MaxInstScan)
  // before falling off the end.
  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->readsRegister(VReg))
      if (--NumUseInst == 0)
        return false;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }
}

/// Place the PHI-destination copy before the first leading non-PHI
/// instruction (up to \p LastPHIIt) that reads \p Dst, so the reader sees the
/// copied value; otherwise fall back to the default placement.
MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}

/// Create the copy feeding a PHI source. When the insertion point is an
/// SI_IF / SI_ELSE / SI_IF_BREAK that defines \p Src, emit an
/// S_MOV_B32/B64_term after it (with an implicit exec use) instead of a plain
/// COPY, so the exec-mask value survives terminator lowering.
MachineInstr *SIInstrInfo::createPHISourceCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
  if (InsPt != MBB.end() &&
      (InsPt->getOpcode() == AMDGPU::SI_IF ||
       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
      InsPt->definesRegister(Src)) {
    InsPt++;
    return BuildMI(MBB, InsPt, DL,
                   get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
                                     : AMDGPU::S_MOV_B64_term),
                   Dst)
        .addReg(Src, 0, SrcSubReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);
  }
  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
                                              Dst);
}

bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }

/// Never actually folds a memory operand (always returns nullptr); used only
/// as a hook to constrain register classes before the default folding runs.
MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
    VirtRegMap *VRM) const {
  // This is a bit of a hack (copied from AArch64). Consider this instruction:
  //
  // %0:sreg_32 = COPY $m0
  //
  // We explicitly chose SReg_32 for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill $m0 normally (it would require copying to
  // a numbered SGPR anyway), and since it is in the SReg_32 register class,
  // TargetInstrInfo::foldMemoryOperand() is going to try.
  // A similar issue also exists with spilling and reloading $exec registers.
  //
  // To prevent that, constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    // Only copies between exactly one virtual and one physical register are
    // of interest here.
    if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
        (DstReg.isVirtual() != SrcReg.isVirtual())) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
      Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
      const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
      if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
        return nullptr;
      } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
        return nullptr;
      }
    }
  }

  return nullptr;
}

/// Latency of \p MI. For a bundle this is the maximum latency of any bundled
/// instruction plus one cycle per additional bundled instruction
/// (Lat + Count - 1); otherwise the scheduling model's latency for MI.
unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                      const MachineInstr &MI,
                                      unsigned *PredCost) const {
  if (MI.isBundle()) {
    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
    unsigned Lat = 0, Count = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      ++Count;
      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
    }
    return Lat + Count - 1;
  }

  return SchedModel.computeInstrLatency(&MI);
}