//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // No implicit operands.
    return MI.getNumOperands() == MI.getDesc().getNumOperands();
  default:
    return false;
  }
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
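    // The read2/write2 forms carry offset0/offset1 rather than a single
    // offset operand, so the lookups below return -1 for them and we bail out.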
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 =
        cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 =
        cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

unsigned SIInstrInfo::getOperandSizeInBytes(const MachineInstr &LdSt,
                                            const MachineOperand *MOp) const {
  assert(MOp && "Unexpected null machine operand!");
  const MachineRegisterInfo &MRI = LdSt.getParent()->getParent()->getRegInfo();
  const Register Reg = MOp->getReg();
  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);
  return (RI.getRegSizeInBits(*DstRC) / 8);
}

bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp, *MOp;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
      if (!MOp)
        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data0);
      Width = getOperandSizeInBytes(LdSt, MOp);
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can
      // treat these as a load with a single offset if the 2 offsets are
      // consecutive. We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm();
      unsigned Offset1 = Offset1Op->getImm();
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
      if (!MOp) {
        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data0);
        Width = getOperandSizeInBytes(LdSt, MOp);
        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data1);
        Width += getOperandSizeInBytes(LdSt, MOp);
      } else {
        Width = getOperandSizeInBytes(LdSt, MOp);
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset && SOffset->isReg()) {
      // We can only handle this if it's a stack access, as any other resource
      // would require reporting multiple base registers.
      const MachineOperand *AddrReg =
          getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (AddrReg && !AddrReg->isFI())
        return false;

      const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      const SIMachineFunctionInfo *MFI
        = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
      if (RSrc->getReg() != MFI->getScratchRSrcReg())
        return false;

      const MachineOperand *OffsetImm =
          getNamedOperand(LdSt, AMDGPU::OpName::offset);
      BaseOps.push_back(RSrc);
      BaseOps.push_back(SOffset);
      Offset = OffsetImm->getImm();
      // Get appropriate operand, and compute width accordingly.
      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
      if (!MOp)
        MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
      Width = getOperandSizeInBytes(LdSt, MOp);
      return true;
    }

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(BaseOp);

    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);

    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    if (SOffset) // soffset can be an inline immediate.
      Offset += SOffset->getImm();
    // Get appropriate operand, and compute width accordingly.
    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
    if (!MOp)
      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
    Width = getOperandSizeInBytes(LdSt, MOp);
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    MOp = getNamedOperand(LdSt, AMDGPU::OpName::sdst);
    Width = getOperandSizeInBytes(LdSt, MOp);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
    if (!MOp)
      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
    Width = getOperandSizeInBytes(LdSt, MOp);
    return true;
  }

  return false;
}

static bool
memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
                           ArrayRef<const MachineOperand *> BaseOps2) {
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I)
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  return true;
}

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  if (memOpsHaveSameBaseOperands(BaseOps1, BaseOps2))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  const MachineFunction &MF = *MI1.getParent()->getParent();
  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
  Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  assert(!BaseOps1.empty() && !BaseOps2.empty());
  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();

  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
    return false;

  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
    const unsigned MaxGlobalLoadCluster = 7;
    if (NumLoads > MaxGlobalLoadCluster)
      return false;

    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    if (!FirstDst)
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
    if (!SecondDst)
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();

  const Register Reg = FirstDst->getReg();

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);

  // FIXME: NumLoads should not have 1 subtracted from it. This is to match the
  // behavior of clusterNeighboringMemOps, which was previously passing the
  // cluster length less 1. LoadClusterThreshold should be tuned instead.
  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
         LoadClusterThreshold;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have less than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal SGPR to VGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

  // FIXME: This is a hack to resolve copies between 16-bit and 32-bit
  // registers until all patterns are fixed.
  if (Fix16BitCopies &&
      ((RI.getRegSizeInBits(*RC) == 16) ^
       (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
    MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
    MCRegister Super = RI.get32BitRegister(RegToFix);
    assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
    RegToFix = Super;

    if (DestReg == SrcReg) {
      // Insert empty bundle since ExpandPostRA expects an instruction here.
      BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
      return;
    }

    RC = RI.getPhysRegClass(DestReg);
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
      .addReg(SrcReg, getKillRegState(KillSrc))
      .addImm(0);
    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
      // First try to find defining accvgpr_write to avoid temporary registers.
      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
        --Def;
        if (!Def->definesRegister(SrcReg, &RI))
          continue;
        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
          break;

        MachineOperand &DefOp = Def->getOperand(1);
        assert(DefOp.isReg() || DefOp.isImm());

        if (DefOp.isReg()) {
          // Check that the register source operand is not clobbered before MI.
          // Immediate operands are always safe to propagate.
          bool SafeToPropagate = true;
          for (auto I = Def; I != MI && SafeToPropagate; ++I)
            if (I->modifiesRegister(DefOp.getReg(), &RI))
              SafeToPropagate = false;

          if (!SafeToPropagate)
            break;

          DefOp.setIsKill(false);
        }

        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
          .add(DefOp);
        return;
      }

      RegScavenger RS;
      RS.enterBasicBlock(MBB);
      RS.forward(MI);

      // Ideally we want to have three registers for a long reg_sequence copy
      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                                 *MBB.getParent());

      // Registers in the sequence are allocated contiguously so we can just
      // use register number to pick one of three round-robin temps.
      unsigned RegNo = DestReg % 3;
      Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
      if (!Tmp)
        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
      RS.setRegUsed(Tmp);
      // Only loop through if there are any free registers left, otherwise
      // the scavenger may report a fatal error when there is no emergency
      // spill slot, or spill with the slot.
      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
          break;
        Tmp = Tmp2;
        RS.setRegUsed(Tmp);
      }
      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
        .addReg(Tmp, RegState::Kill);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RI.getRegSizeInBits(*RC) == 16) {
    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
           AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
                 .addImm(0) // src0_modifiers
                 .addReg(NewSrcReg)
                 .addImm(0) // clamp
                 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                : AMDGPU::SDWA::SdwaSel::WORD_1)
                 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
                 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                                : AMDGPU::SDWA::SdwaSel::WORD_1)
                 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
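    // Tie the implicit read of NewDestReg (added above) to its def: with
    // dst_unused = UNUSED_PRESERVE the SDWA mov keeps the untouched 16-bit
    // half, so the old destination value has to stay live.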
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RC)) {
    // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
    if (!(RI.getRegSizeInBits(*RC) % 64)) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }

    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
  } else if (RI.hasAGPRs(RC)) {
    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
      AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32;
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    if (Opcode == TargetOpcode::COPY) {
      copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
      continue;
    }

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
  }
}

int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  return Opcode;
}

void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  if (RegClass == &AMDGPU::VReg_64RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
    Builder.addImm(IdxValue);
  }
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}

void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(TrueReg)
        .addImm(0)
        .addReg(FalseReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (RI.hasAGPRs(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 32) {
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
    unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
  if (IsSGPR) {
    switch (EltSize) {
    case 32:
      return get(getIndirectSGPRWritePseudo32(VecSize));
    case 64:
      return get(getIndirectSGPRWritePseudo64(VecSize));
    default:
      llvm_unreachable("invalid reg indexing elt size");
    }
  }

  assert(EltSize == 32 && "invalid reg indexing elt size");
  return get(getIndirectVGPRWritePseudoOpc(VecSize));
}

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_S64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_S96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_S128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_S160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_S256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_S512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_V64_SAVE;
  case 12:
    return AMDGPU::SI_SPILL_V96_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_V128_SAVE;
  case 20:
    return AMDGPU::SI_SPILL_V160_SAVE;
  case 32:
    return AMDGPU::SI_SPILL_V256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_V512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_SAVE;
  case 8:
    return AMDGPU::SI_SPILL_A64_SAVE;
  case 16:
    return AMDGPU::SI_SPILL_A128_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    // Add the scratch resource registers as implicit uses because we may end
    // up needing them, and need to ensure that the reserved registers are
    // correctly handled.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
  if (RI.hasAGPRs(RC)) {
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
}

static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_A32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_A64_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_A128_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_A512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
    return;
  }

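  // VGPR and AGPR reloads are lowered to real scratch loads later; the AGPR
  // pseudos carry an extra VGPR temporary def because AGPR data is moved
  // through a VGPR when the spill is expanded.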
  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                    : getVGPRSpillRestoreOpcode(SpillSize);
  auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
  if (RI.hasAGPRs(RC)) {
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addFrameIndex(FrameIndex)           // vaddr
     .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
}

/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
unsigned SIInstrInfo::calculateLDSSpillAddress(
    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
    unsigned FrameOffset, unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  unsigned WavefrontSize = ST.getWavefrontSize();

  Register TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    const DebugLoc &DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
                                   *MF);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;

    if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {
      Register TIDIGXReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
      Register TIDIGYReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
      Register TIDIGZReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
      Register InputPtrReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
        .addReg(InputPtrReg)
        .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // NGROUPS.X * NGROUPS.Y
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
        .addReg(STmp1)
        .addReg(STmp0);
      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
        .addReg(STmp1)
        .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
        .addReg(STmp0)
        .addReg(TIDIGYReg)
        .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
      getAddNoCarry(Entry, Insert, DL, TIDReg)
        .addReg(TIDReg)
        .addReg(TIDIGZReg)
        .addImm(0); // clamp bit
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
        .addImm(-1)
        .addReg(TIDReg);
    }

    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
      .addImm(2)
      .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
  getAddNoCarry(MBB, MI, DL, TmpReg)
    .addImm(LDSOffset)
    .addReg(TIDReg)
    .addImm(0); // clamp bit

  return TmpReg;
}

void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   int Count) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Count > 0) {
    int Arg;
    if (Count >= 8)
      Arg = 7;
    else
      Arg = Count - 1;
    Count -= 8;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
      .addImm(Arg);
  }
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertWaitStates(MBB, MI, 1);
}

void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return 1; // FIXME: Do wait states equal cycles?
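
  // S_NOP with immediate N idles the wave for N + 1 wait states, so convert
  // the operand back to a wait-state count.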
  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;

  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    expandMovDPP64(MI);
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
AMDGPU::EXEC_LO : AMDGPU::EXEC; 1662 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1663 .addReg(Exec); 1664 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 1665 .add(MI.getOperand(2)); 1666 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1667 .addReg(Exec); 1668 MI.eraseFromParent(); 1669 break; 1670 } 1671 case AMDGPU::V_SET_INACTIVE_B64: { 1672 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1673 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1674 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1675 .addReg(Exec); 1676 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 1677 MI.getOperand(0).getReg()) 1678 .add(MI.getOperand(2)); 1679 expandPostRAPseudo(*Copy); 1680 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1681 .addReg(Exec); 1682 MI.eraseFromParent(); 1683 break; 1684 } 1685 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1: 1686 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2: 1687 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3: 1688 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4: 1689 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5: 1690 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8: 1691 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16: 1692 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: 1693 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1: 1694 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2: 1695 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3: 1696 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4: 1697 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5: 1698 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8: 1699 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16: 1700 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32: 1701 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1: 1702 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2: 1703 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4: 1704 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8: 1705 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: { 1706 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); 1707 1708 unsigned Opc; 1709 if (RI.hasVGPRs(EltRC)) { 1710 Opc = ST.useVGPRIndexMode() ? 1711 AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32; 1712 } else { 1713 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? 1714 AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32; 1715 } 1716 1717 const MCInstrDesc &OpDesc = get(Opc); 1718 Register VecReg = MI.getOperand(0).getReg(); 1719 bool IsUndef = MI.getOperand(1).isUndef(); 1720 unsigned SubReg = MI.getOperand(3).getImm(); 1721 assert(VecReg == MI.getOperand(1).getReg()); 1722 1723 MachineInstrBuilder MIB = 1724 BuildMI(MBB, MI, DL, OpDesc) 1725 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1726 .add(MI.getOperand(2)) 1727 .addReg(VecReg, RegState::ImplicitDefine) 1728 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 1729 1730 const int ImpDefIdx = 1731 OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); 1732 const int ImpUseIdx = ImpDefIdx + 1; 1733 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 1734 MI.eraseFromParent(); 1735 break; 1736 } 1737 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 1738 MachineFunction &MF = *MBB.getParent(); 1739 Register Reg = MI.getOperand(0).getReg(); 1740 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 1741 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 1742 1743 // Create a bundle so these instructions won't be re-ordered by the 1744 // post-RA scheduler. 1745 MIBundleBuilder Bundler(MBB, MI); 1746 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 1747 1748 // Add 32-bit offset from this instruction to the start of the 1749 // constant data. 
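// The resulting bundle looks roughly like this (assuming the usual
// sym@rel32@lo / sym@rel32@hi operands and an s[N:N+1] pair for Reg):
//   s_getpc_b64 s[N:N+1]
//   s_add_u32   sN, sN, sym@rel32@lo
//   s_addc_u32  sN+1, sN+1, sym@rel32@hi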
1750 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 1751 .addReg(RegLo) 1752 .add(MI.getOperand(1))); 1753 1754 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 1755 .addReg(RegHi); 1756 MIB.add(MI.getOperand(2)); 1757 1758 Bundler.append(MIB); 1759 finalizeBundle(MBB, Bundler.begin()); 1760 1761 MI.eraseFromParent(); 1762 break; 1763 } 1764 case AMDGPU::ENTER_WWM: { 1765 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when 1766 // WWM is entered. 1767 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 1768 : AMDGPU::S_OR_SAVEEXEC_B64)); 1769 break; 1770 } 1771 case AMDGPU::EXIT_WWM: { 1772 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when 1773 // WWM is exited. 1774 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); 1775 break; 1776 } 1777 } 1778 return true; 1779 } 1780 1781 std::pair<MachineInstr*, MachineInstr*> 1782 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { 1783 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); 1784 1785 MachineBasicBlock &MBB = *MI.getParent(); 1786 DebugLoc DL = MBB.findDebugLoc(MI); 1787 MachineFunction *MF = MBB.getParent(); 1788 MachineRegisterInfo &MRI = MF->getRegInfo(); 1789 Register Dst = MI.getOperand(0).getReg(); 1790 unsigned Part = 0; 1791 MachineInstr *Split[2]; 1792 1793 1794 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { 1795 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); 1796 if (Dst.isPhysical()) { 1797 MovDPP.addDef(RI.getSubReg(Dst, Sub)); 1798 } else { 1799 assert(MRI.isSSA()); 1800 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1801 MovDPP.addDef(Tmp); 1802 } 1803 1804 for (unsigned I = 1; I <= 2; ++I) { // old and src operands. 1805 const MachineOperand &SrcOp = MI.getOperand(I); 1806 assert(!SrcOp.isFPImm()); 1807 if (SrcOp.isImm()) { 1808 APInt Imm(64, SrcOp.getImm()); 1809 Imm.ashrInPlace(Part * 32); 1810 MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); 1811 } else { 1812 assert(SrcOp.isReg()); 1813 Register Src = SrcOp.getReg(); 1814 if (Src.isPhysical()) 1815 MovDPP.addReg(RI.getSubReg(Src, Sub)); 1816 else 1817 MovDPP.addReg(Src, SrcOp.isUndef() ? 
RegState::Undef : 0, Sub); 1818 } 1819 } 1820 1821 for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) 1822 MovDPP.addImm(MI.getOperand(I).getImm()); 1823 1824 Split[Part] = MovDPP; 1825 ++Part; 1826 } 1827 1828 if (Dst.isVirtual()) 1829 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) 1830 .addReg(Split[0]->getOperand(0).getReg()) 1831 .addImm(AMDGPU::sub0) 1832 .addReg(Split[1]->getOperand(0).getReg()) 1833 .addImm(AMDGPU::sub1); 1834 1835 MI.eraseFromParent(); 1836 return std::make_pair(Split[0], Split[1]); 1837 } 1838 1839 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 1840 MachineOperand &Src0, 1841 unsigned Src0OpName, 1842 MachineOperand &Src1, 1843 unsigned Src1OpName) const { 1844 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 1845 if (!Src0Mods) 1846 return false; 1847 1848 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 1849 assert(Src1Mods && 1850 "All commutable instructions have both src0 and src1 modifiers"); 1851 1852 int Src0ModsVal = Src0Mods->getImm(); 1853 int Src1ModsVal = Src1Mods->getImm(); 1854 1855 Src1Mods->setImm(Src0ModsVal); 1856 Src0Mods->setImm(Src1ModsVal); 1857 return true; 1858 } 1859 1860 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 1861 MachineOperand &RegOp, 1862 MachineOperand &NonRegOp) { 1863 Register Reg = RegOp.getReg(); 1864 unsigned SubReg = RegOp.getSubReg(); 1865 bool IsKill = RegOp.isKill(); 1866 bool IsDead = RegOp.isDead(); 1867 bool IsUndef = RegOp.isUndef(); 1868 bool IsDebug = RegOp.isDebug(); 1869 1870 if (NonRegOp.isImm()) 1871 RegOp.ChangeToImmediate(NonRegOp.getImm()); 1872 else if (NonRegOp.isFI()) 1873 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 1874 else 1875 return nullptr; 1876 1877 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 1878 NonRegOp.setSubReg(SubReg); 1879 1880 return &MI; 1881 } 1882 1883 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 1884 unsigned Src0Idx, 1885 unsigned Src1Idx) const { 1886 assert(!NewMI && "this should never be used"); 1887 1888 unsigned Opc = MI.getOpcode(); 1889 int CommutedOpcode = commuteOpcode(Opc); 1890 if (CommutedOpcode == -1) 1891 return nullptr; 1892 1893 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 1894 static_cast<int>(Src0Idx) && 1895 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 1896 static_cast<int>(Src1Idx) && 1897 "inconsistency with findCommutedOpIndices"); 1898 1899 MachineOperand &Src0 = MI.getOperand(Src0Idx); 1900 MachineOperand &Src1 = MI.getOperand(Src1Idx); 1901 1902 MachineInstr *CommutedMI = nullptr; 1903 if (Src0.isReg() && Src1.isReg()) { 1904 if (isOperandLegal(MI, Src1Idx, &Src0)) { 1905 // Be sure to copy the source modifiers to the right place. 1906 CommutedMI 1907 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 1908 } 1909 1910 } else if (Src0.isReg() && !Src1.isReg()) { 1911 // src0 should always be able to support any operand type, so no need to 1912 // check operand legality. 1913 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 1914 } else if (!Src0.isReg() && Src1.isReg()) { 1915 if (isOperandLegal(MI, Src1Idx, &Src0)) 1916 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 1917 } else { 1918 // FIXME: Found two non registers to commute. This does happen. 
1919 return nullptr; 1920 } 1921 1922 if (CommutedMI) { 1923 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1924 Src1, AMDGPU::OpName::src1_modifiers); 1925 1926 CommutedMI->setDesc(get(CommutedOpcode)); 1927 } 1928 1929 return CommutedMI; 1930 } 1931 1932 // This needs to be implemented because the source modifiers may be inserted 1933 // between the true commutable operands, and the base 1934 // TargetInstrInfo::commuteInstruction uses it. 1935 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, 1936 unsigned &SrcOpIdx0, 1937 unsigned &SrcOpIdx1) const { 1938 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); 1939 } 1940 1941 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, 1942 unsigned &SrcOpIdx1) const { 1943 if (!Desc.isCommutable()) 1944 return false; 1945 1946 unsigned Opc = Desc.getOpcode(); 1947 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1948 if (Src0Idx == -1) 1949 return false; 1950 1951 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1952 if (Src1Idx == -1) 1953 return false; 1954 1955 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1956 } 1957 1958 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 1959 int64_t BrOffset) const { 1960 // BranchRelaxation should never have to check s_setpc_b64 because its dest 1961 // block is unanalyzable. 1962 assert(BranchOp != AMDGPU::S_SETPC_B64); 1963 1964 // Convert to dwords. 1965 BrOffset /= 4; 1966 1967 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 1968 // from the next instruction. 1969 BrOffset -= 1; 1970 1971 return isIntN(BranchOffsetBits, BrOffset); 1972 } 1973 1974 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 1975 const MachineInstr &MI) const { 1976 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 1977 // This would be a difficult analysis to perform, but can always be legal so 1978 // there's no need to analyze it. 1979 return nullptr; 1980 } 1981 1982 return MI.getOperand(0).getMBB(); 1983 } 1984 1985 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1986 MachineBasicBlock &DestBB, 1987 const DebugLoc &DL, 1988 int64_t BrOffset, 1989 RegScavenger *RS) const { 1990 assert(RS && "RegScavenger required for long branching"); 1991 assert(MBB.empty() && 1992 "new block should be inserted for expanding unconditional branch"); 1993 assert(MBB.pred_size() == 1); 1994 1995 MachineFunction *MF = MBB.getParent(); 1996 MachineRegisterInfo &MRI = MF->getRegInfo(); 1997 1998 // FIXME: Virtual register workaround for RegScavenger not working with empty 1999 // blocks. 2000 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2001 2002 auto I = MBB.end(); 2003 2004 // We need to compute the offset relative to the instruction immediately after 2005 // s_getpc_b64. Insert pc arithmetic code before last terminator. 2006 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 2007 2008 // TODO: Handle > 32-bit block address. 2009 if (BrOffset >= 0) { 2010 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 2011 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 2012 .addReg(PCReg, 0, AMDGPU::sub0) 2013 .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); 2014 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 2015 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 2016 .addReg(PCReg, 0, AMDGPU::sub1) 2017 .addImm(0); 2018 } else { 2019 // Backwards branch. 
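// Illustrative expansion for this case (the actual SGPR pair is chosen by
// the scavenger further down):
//   s_getpc_b64 s[8:9]
//   s_sub_u32   s8, s8, <offset back to DestBB>
//   s_subb_u32  s9, s9, 0
//   s_setpc_b64 s[8:9]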
2020 BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) 2021 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 2022 .addReg(PCReg, 0, AMDGPU::sub0) 2023 .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); 2024 BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) 2025 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 2026 .addReg(PCReg, 0, AMDGPU::sub1) 2027 .addImm(0); 2028 } 2029 2030 // Insert the indirect branch after the other terminator. 2031 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 2032 .addReg(PCReg); 2033 2034 // FIXME: If spilling is necessary, this will fail because this scavenger has 2035 // no emergency stack slots. It is non-trivial to spill in this situation, 2036 // because the restore code needs to be specially placed after the 2037 // jump. BranchRelaxation then needs to be made aware of the newly inserted 2038 // block. 2039 // 2040 // If a spill is needed for the pc register pair, we need to insert a spill 2041 // restore block right before the destination block, and insert a short branch 2042 // into the old destination block's fallthrough predecessor. 2043 // e.g.: 2044 // 2045 // s_cbranch_scc0 skip_long_branch: 2046 // 2047 // long_branch_bb: 2048 // spill s[8:9] 2049 // s_getpc_b64 s[8:9] 2050 // s_add_u32 s8, s8, restore_bb 2051 // s_addc_u32 s9, s9, 0 2052 // s_setpc_b64 s[8:9] 2053 // 2054 // skip_long_branch: 2055 // foo; 2056 // 2057 // ..... 2058 // 2059 // dest_bb_fallthrough_predecessor: 2060 // bar; 2061 // s_branch dest_bb 2062 // 2063 // restore_bb: 2064 // restore s[8:9] 2065 // fallthrough dest_bb 2066 /// 2067 // dest_bb: 2068 // buzz; 2069 2070 RS->enterBasicBlockEnd(MBB); 2071 unsigned Scav = RS->scavengeRegisterBackwards( 2072 AMDGPU::SReg_64RegClass, 2073 MachineBasicBlock::iterator(GetPC), false, 0); 2074 MRI.replaceRegWith(PCReg, Scav); 2075 MRI.clearVirtRegs(); 2076 RS->setRegUsed(Scav); 2077 2078 return 4 + 8 + 4 + 4; 2079 } 2080 2081 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 2082 switch (Cond) { 2083 case SIInstrInfo::SCC_TRUE: 2084 return AMDGPU::S_CBRANCH_SCC1; 2085 case SIInstrInfo::SCC_FALSE: 2086 return AMDGPU::S_CBRANCH_SCC0; 2087 case SIInstrInfo::VCCNZ: 2088 return AMDGPU::S_CBRANCH_VCCNZ; 2089 case SIInstrInfo::VCCZ: 2090 return AMDGPU::S_CBRANCH_VCCZ; 2091 case SIInstrInfo::EXECNZ: 2092 return AMDGPU::S_CBRANCH_EXECNZ; 2093 case SIInstrInfo::EXECZ: 2094 return AMDGPU::S_CBRANCH_EXECZ; 2095 default: 2096 llvm_unreachable("invalid branch predicate"); 2097 } 2098 } 2099 2100 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 2101 switch (Opcode) { 2102 case AMDGPU::S_CBRANCH_SCC0: 2103 return SCC_FALSE; 2104 case AMDGPU::S_CBRANCH_SCC1: 2105 return SCC_TRUE; 2106 case AMDGPU::S_CBRANCH_VCCNZ: 2107 return VCCNZ; 2108 case AMDGPU::S_CBRANCH_VCCZ: 2109 return VCCZ; 2110 case AMDGPU::S_CBRANCH_EXECNZ: 2111 return EXECNZ; 2112 case AMDGPU::S_CBRANCH_EXECZ: 2113 return EXECZ; 2114 default: 2115 return INVALID_BR; 2116 } 2117 } 2118 2119 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 2120 MachineBasicBlock::iterator I, 2121 MachineBasicBlock *&TBB, 2122 MachineBasicBlock *&FBB, 2123 SmallVectorImpl<MachineOperand> &Cond, 2124 bool AllowModify) const { 2125 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2126 // Unconditional Branch 2127 TBB = I->getOperand(0).getMBB(); 2128 return false; 2129 } 2130 2131 MachineBasicBlock *CondBB = nullptr; 2132 2133 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 2134 CondBB = I->getOperand(1).getMBB(); 2135 Cond.push_back(I->getOperand(0)); 2136 } else 
{ 2137 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 2138 if (Pred == INVALID_BR) 2139 return true; 2140 2141 CondBB = I->getOperand(0).getMBB(); 2142 Cond.push_back(MachineOperand::CreateImm(Pred)); 2143 Cond.push_back(I->getOperand(1)); // Save the branch register. 2144 } 2145 ++I; 2146 2147 if (I == MBB.end()) { 2148 // Conditional branch followed by fall-through. 2149 TBB = CondBB; 2150 return false; 2151 } 2152 2153 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2154 TBB = CondBB; 2155 FBB = I->getOperand(0).getMBB(); 2156 return false; 2157 } 2158 2159 return true; 2160 } 2161 2162 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 2163 MachineBasicBlock *&FBB, 2164 SmallVectorImpl<MachineOperand> &Cond, 2165 bool AllowModify) const { 2166 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2167 auto E = MBB.end(); 2168 if (I == E) 2169 return false; 2170 2171 // Skip over the instructions that are artificially terminators for special 2172 // exec management. 2173 while (I != E && !I->isBranch() && !I->isReturn() && 2174 I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { 2175 switch (I->getOpcode()) { 2176 case AMDGPU::SI_MASK_BRANCH: 2177 case AMDGPU::S_MOV_B64_term: 2178 case AMDGPU::S_XOR_B64_term: 2179 case AMDGPU::S_ANDN2_B64_term: 2180 case AMDGPU::S_MOV_B32_term: 2181 case AMDGPU::S_XOR_B32_term: 2182 case AMDGPU::S_OR_B32_term: 2183 case AMDGPU::S_ANDN2_B32_term: 2184 break; 2185 case AMDGPU::SI_IF: 2186 case AMDGPU::SI_ELSE: 2187 case AMDGPU::SI_KILL_I1_TERMINATOR: 2188 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 2189 // FIXME: It's messy that these need to be considered here at all. 2190 return true; 2191 default: 2192 llvm_unreachable("unexpected non-branch terminator inst"); 2193 } 2194 2195 ++I; 2196 } 2197 2198 if (I == E) 2199 return false; 2200 2201 if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) 2202 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 2203 2204 ++I; 2205 2206 // TODO: Should be able to treat as fallthrough? 2207 if (I == MBB.end()) 2208 return true; 2209 2210 if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) 2211 return true; 2212 2213 MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); 2214 2215 // Specifically handle the case where the conditional branch is to the same 2216 // destination as the mask branch. e.g. 2217 // 2218 // si_mask_branch BB8 2219 // s_cbranch_execz BB8 2220 // s_cbranch BB9 2221 // 2222 // This is required to understand divergent loops which may need the branches 2223 // to be relaxed. 2224 if (TBB != MaskBrDest || Cond.empty()) 2225 return true; 2226 2227 auto Pred = Cond[0].getImm(); 2228 return (Pred != EXECZ && Pred != EXECNZ); 2229 } 2230 2231 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 2232 int *BytesRemoved) const { 2233 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2234 2235 unsigned Count = 0; 2236 unsigned RemovedSize = 0; 2237 while (I != MBB.end()) { 2238 MachineBasicBlock::iterator Next = std::next(I); 2239 if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { 2240 I = Next; 2241 continue; 2242 } 2243 2244 RemovedSize += getInstSizeInBytes(*I); 2245 I->eraseFromParent(); 2246 ++Count; 2247 I = Next; 2248 } 2249 2250 if (BytesRemoved) 2251 *BytesRemoved = RemovedSize; 2252 2253 return Count; 2254 } 2255 2256 // Copy the flags onto the implicit condition register operand. 
2257 static void preserveCondRegFlags(MachineOperand &CondReg,
2258 const MachineOperand &OrigCond) {
2259 CondReg.setIsUndef(OrigCond.isUndef());
2260 CondReg.setIsKill(OrigCond.isKill());
2261 }
2262
2263 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
2264 MachineBasicBlock *TBB,
2265 MachineBasicBlock *FBB,
2266 ArrayRef<MachineOperand> Cond,
2267 const DebugLoc &DL,
2268 int *BytesAdded) const {
2269 if (!FBB && Cond.empty()) {
2270 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2271 .addMBB(TBB);
2272 if (BytesAdded)
2273 *BytesAdded = 4;
2274 return 1;
2275 }
2276
2277 if (Cond.size() == 1 && Cond[0].isReg()) {
2278 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
2279 .add(Cond[0])
2280 .addMBB(TBB);
2281 return 1;
2282 }
2283
2284 assert(TBB && Cond[0].isImm());
2285
2286 unsigned Opcode
2287 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2288
2289 if (!FBB) {
2291 MachineInstr *CondBr =
2292 BuildMI(&MBB, DL, get(Opcode))
2293 .addMBB(TBB);
2294
2295 // Copy the flags onto the implicit condition register operand.
2296 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2297
2298 if (BytesAdded)
2299 *BytesAdded = 4;
2300 return 1;
2301 }
2302
2303 assert(TBB && FBB);
2304
2305 MachineInstr *CondBr =
2306 BuildMI(&MBB, DL, get(Opcode))
2307 .addMBB(TBB);
2308 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2309 .addMBB(FBB);
2310
2311 MachineOperand &CondReg = CondBr->getOperand(1);
2312 CondReg.setIsUndef(Cond[1].isUndef());
2313 CondReg.setIsKill(Cond[1].isKill());
2314
2315 if (BytesAdded)
2316 *BytesAdded = 8;
2317
2318 return 2;
2319 }
2320
2321 bool SIInstrInfo::reverseBranchCondition(
2322 SmallVectorImpl<MachineOperand> &Cond) const {
2323 if (Cond.size() != 2) {
2324 return true;
2325 }
2326
2327 if (Cond[0].isImm()) {
2328 Cond[0].setImm(-Cond[0].getImm());
2329 return false;
2330 }
2331
2332 return true;
2333 }
2334
2335 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
2336 ArrayRef<MachineOperand> Cond,
2337 Register DstReg, Register TrueReg,
2338 Register FalseReg, int &CondCycles,
2339 int &TrueCycles, int &FalseCycles) const {
2340 switch (Cond[0].getImm()) {
2341 case VCCNZ:
2342 case VCCZ: {
2343 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2344 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2345 assert(MRI.getRegClass(FalseReg) == RC);
2346
2347 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2348 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2349
2350 // Limit to equal cost for branch vs. N v_cndmask_b32s.
2351 return RI.hasVGPRs(RC) && NumInsts <= 6;
2352 }
2353 case SCC_TRUE:
2354 case SCC_FALSE: {
2355 // FIXME: We could insert for VGPRs if we could replace the original compare
2356 // with a vector one.
2357 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2358 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2359 assert(MRI.getRegClass(FalseReg) == RC);
2360
2361 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2362
2363 // Sizes that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
2364 if (NumInsts % 2 == 0)
2365 NumInsts /= 2;
2366
2367 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
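// E.g. a 128-bit SGPR select is costed as two s_cselect_b64, so NumInsts
// (and the reported cycle counts) ends up as 2 here.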
2368 return RI.isSGPRClass(RC); 2369 } 2370 default: 2371 return false; 2372 } 2373 } 2374 2375 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2376 MachineBasicBlock::iterator I, const DebugLoc &DL, 2377 Register DstReg, ArrayRef<MachineOperand> Cond, 2378 Register TrueReg, Register FalseReg) const { 2379 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2380 if (Pred == VCCZ || Pred == SCC_FALSE) { 2381 Pred = static_cast<BranchPredicate>(-Pred); 2382 std::swap(TrueReg, FalseReg); 2383 } 2384 2385 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2386 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2387 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2388 2389 if (DstSize == 32) { 2390 MachineInstr *Select; 2391 if (Pred == SCC_TRUE) { 2392 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 2393 .addReg(TrueReg) 2394 .addReg(FalseReg); 2395 } else { 2396 // Instruction's operands are backwards from what is expected. 2397 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 2398 .addReg(FalseReg) 2399 .addReg(TrueReg); 2400 } 2401 2402 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2403 return; 2404 } 2405 2406 if (DstSize == 64 && Pred == SCC_TRUE) { 2407 MachineInstr *Select = 2408 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2409 .addReg(TrueReg) 2410 .addReg(FalseReg); 2411 2412 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2413 return; 2414 } 2415 2416 static const int16_t Sub0_15[] = { 2417 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2418 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2419 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2420 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2421 }; 2422 2423 static const int16_t Sub0_15_64[] = { 2424 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2425 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2426 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2427 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2428 }; 2429 2430 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2431 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2432 const int16_t *SubIndices = Sub0_15; 2433 int NElts = DstSize / 32; 2434 2435 // 64-bit select is only available for SALU. 2436 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 
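// For the VGPR case the loop below emits one v_cndmask_b32 per 32-bit
// element and reassembles the result with a REG_SEQUENCE; e.g. a 64-bit
// VGPR select becomes two v_cndmask_b32 defining sub0 and sub1.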
2437 if (Pred == SCC_TRUE) {
2438 if (NElts % 2) {
2439 SelOp = AMDGPU::S_CSELECT_B32;
2440 EltRC = &AMDGPU::SGPR_32RegClass;
2441 } else {
2442 SelOp = AMDGPU::S_CSELECT_B64;
2443 EltRC = &AMDGPU::SGPR_64RegClass;
2444 SubIndices = Sub0_15_64;
2445 NElts /= 2;
2446 }
2447 }
2448
2449 MachineInstrBuilder MIB = BuildMI(
2450 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
2451
2452 I = MIB->getIterator();
2453
2454 SmallVector<Register, 8> Regs;
2455 for (int Idx = 0; Idx != NElts; ++Idx) {
2456 Register DstElt = MRI.createVirtualRegister(EltRC);
2457 Regs.push_back(DstElt);
2458
2459 unsigned SubIdx = SubIndices[Idx];
2460
2461 MachineInstr *Select;
2462 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
2463 Select =
2464 BuildMI(MBB, I, DL, get(SelOp), DstElt)
2465 .addReg(FalseReg, 0, SubIdx)
2466 .addReg(TrueReg, 0, SubIdx);
2467 } else {
2468 Select =
2469 BuildMI(MBB, I, DL, get(SelOp), DstElt)
2470 .addReg(TrueReg, 0, SubIdx)
2471 .addReg(FalseReg, 0, SubIdx);
2472 }
2473
2474 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2475 fixImplicitOperands(*Select);
2476
2477 MIB.addReg(DstElt)
2478 .addImm(SubIdx);
2479 }
2480 }
2481
2482 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
2483 switch (MI.getOpcode()) {
2484 case AMDGPU::V_MOV_B32_e32:
2485 case AMDGPU::V_MOV_B32_e64:
2486 case AMDGPU::V_MOV_B64_PSEUDO: {
2487 // If there are additional implicit register operands, this may be used for
2488 // register indexing so the source register operand isn't simply copied.
2489 unsigned NumOps = MI.getDesc().getNumOperands() +
2490 MI.getDesc().getNumImplicitUses();
2491
2492 return MI.getNumOperands() == NumOps;
2493 }
2494 case AMDGPU::S_MOV_B32:
2495 case AMDGPU::S_MOV_B64:
2496 case AMDGPU::COPY:
2497 case AMDGPU::V_ACCVGPR_WRITE_B32:
2498 case AMDGPU::V_ACCVGPR_READ_B32:
2499 return true;
2500 default:
2501 return false;
2502 }
2503 }
2504
2505 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
2506 unsigned Kind) const {
2507 switch (Kind) {
2508 case PseudoSourceValue::Stack:
2509 case PseudoSourceValue::FixedStack:
2510 return AMDGPUAS::PRIVATE_ADDRESS;
2511 case PseudoSourceValue::ConstantPool:
2512 case PseudoSourceValue::GOT:
2513 case PseudoSourceValue::JumpTable:
2514 case PseudoSourceValue::GlobalValueCallEntry:
2515 case PseudoSourceValue::ExternalSymbolCallEntry:
2516 case PseudoSourceValue::TargetCustom:
2517 return AMDGPUAS::CONSTANT_ADDRESS;
2518 }
2519 return AMDGPUAS::FLAT_ADDRESS;
2520 }
2521
2522 static void removeModOperands(MachineInstr &MI) {
2523 unsigned Opc = MI.getOpcode();
2524 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2525 AMDGPU::OpName::src0_modifiers);
2526 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2527 AMDGPU::OpName::src1_modifiers);
2528 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2529 AMDGPU::OpName::src2_modifiers);
2530
2531 MI.RemoveOperand(Src2ModIdx);
2532 MI.RemoveOperand(Src1ModIdx);
2533 MI.RemoveOperand(Src0ModIdx);
2534 }
2535
2536 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2537 Register Reg, MachineRegisterInfo *MRI) const {
2538 if (!MRI->hasOneNonDBGUse(Reg))
2539 return false;
2540
2541 switch (DefMI.getOpcode()) {
2542 default:
2543 return false;
2544 case AMDGPU::S_MOV_B64:
2545 // TODO: We could fold 64-bit immediates, but this gets complicated
2546 // when there are sub-registers.
2547 return false; 2548 2549 case AMDGPU::V_MOV_B32_e32: 2550 case AMDGPU::S_MOV_B32: 2551 case AMDGPU::V_ACCVGPR_WRITE_B32: 2552 break; 2553 } 2554 2555 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 2556 assert(ImmOp); 2557 // FIXME: We could handle FrameIndex values here. 2558 if (!ImmOp->isImm()) 2559 return false; 2560 2561 unsigned Opc = UseMI.getOpcode(); 2562 if (Opc == AMDGPU::COPY) { 2563 Register DstReg = UseMI.getOperand(0).getReg(); 2564 bool Is16Bit = getOpSize(UseMI, 0) == 2; 2565 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); 2566 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2567 APInt Imm(32, ImmOp->getImm()); 2568 2569 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) 2570 Imm = Imm.ashr(16); 2571 2572 if (RI.isAGPR(*MRI, DstReg)) { 2573 if (!isInlineConstant(Imm)) 2574 return false; 2575 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; 2576 } 2577 2578 if (Is16Bit) { 2579 if (isVGPRCopy) 2580 return false; // Do not clobber vgpr_hi16 2581 2582 if (DstReg.isVirtual() && 2583 UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) 2584 return false; 2585 2586 UseMI.getOperand(0).setSubReg(0); 2587 if (DstReg.isPhysical()) { 2588 DstReg = RI.get32BitRegister(DstReg); 2589 UseMI.getOperand(0).setReg(DstReg); 2590 } 2591 assert(UseMI.getOperand(1).getReg().isVirtual()); 2592 } 2593 2594 UseMI.setDesc(get(NewOpc)); 2595 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); 2596 UseMI.getOperand(1).setTargetFlags(0); 2597 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2598 return true; 2599 } 2600 2601 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2602 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || 2603 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2604 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { 2605 // Don't fold if we are using source or output modifiers. The new VOP2 2606 // instructions don't have them. 2607 if (hasAnyModifiersSet(UseMI)) 2608 return false; 2609 2610 // If this is a free constant, there's no reason to do this. 2611 // TODO: We could fold this here instead of letting SIFoldOperands do it 2612 // later. 2613 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2614 2615 // Any src operand can be used for the legality check. 2616 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2617 return false; 2618 2619 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2620 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; 2621 bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2622 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; 2623 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2624 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2625 2626 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 2627 // We should only expect these to be on src0 due to canonicalizations. 2628 if (Src0->isReg() && Src0->getReg() == Reg) { 2629 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2630 return false; 2631 2632 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2633 return false; 2634 2635 unsigned NewOpc = 2636 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) 2637 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 2638 if (pseudoToMCOpcode(NewOpc) == -1) 2639 return false; 2640 2641 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 
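// Sketch of the rewrite, where %k is the register defined by the
// immediate mov being folded:
//   V_MAD_F32 %d, %k, %a, %b  ->  V_MADMK_F32 %d, %a, <imm>, %b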
2642 2643 const int64_t Imm = ImmOp->getImm(); 2644 2645 // FIXME: This would be a lot easier if we could return a new instruction 2646 // instead of having to modify in place. 2647 2648 // Remove these first since they are at the end. 2649 UseMI.RemoveOperand( 2650 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2651 UseMI.RemoveOperand( 2652 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2653 2654 Register Src1Reg = Src1->getReg(); 2655 unsigned Src1SubReg = Src1->getSubReg(); 2656 Src0->setReg(Src1Reg); 2657 Src0->setSubReg(Src1SubReg); 2658 Src0->setIsKill(Src1->isKill()); 2659 2660 if (Opc == AMDGPU::V_MAC_F32_e64 || 2661 Opc == AMDGPU::V_MAC_F16_e64 || 2662 Opc == AMDGPU::V_FMAC_F32_e64 || 2663 Opc == AMDGPU::V_FMAC_F16_e64) 2664 UseMI.untieRegOperand( 2665 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2666 2667 Src1->ChangeToImmediate(Imm); 2668 2669 removeModOperands(UseMI); 2670 UseMI.setDesc(get(NewOpc)); 2671 2672 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2673 if (DeleteDef) 2674 DefMI.eraseFromParent(); 2675 2676 return true; 2677 } 2678 2679 // Added part is the constant: Use v_madak_{f16, f32}. 2680 if (Src2->isReg() && Src2->getReg() == Reg) { 2681 // Not allowed to use constant bus for another operand. 2682 // We can however allow an inline immediate as src0. 2683 bool Src0Inlined = false; 2684 if (Src0->isReg()) { 2685 // Try to inline constant if possible. 2686 // If the Def moves immediate and the use is single 2687 // We are saving VGPR here. 2688 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2689 if (Def && Def->isMoveImmediate() && 2690 isInlineConstant(Def->getOperand(1)) && 2691 MRI->hasOneUse(Src0->getReg())) { 2692 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2693 Src0Inlined = true; 2694 } else if ((Register::isPhysicalRegister(Src0->getReg()) && 2695 (ST.getConstantBusLimit(Opc) <= 1 && 2696 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || 2697 (Register::isVirtualRegister(Src0->getReg()) && 2698 (ST.getConstantBusLimit(Opc) <= 1 && 2699 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 2700 return false; 2701 // VGPR is okay as Src0 - fallthrough 2702 } 2703 2704 if (Src1->isReg() && !Src0Inlined ) { 2705 // We have one slot for inlinable constant so far - try to fill it 2706 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2707 if (Def && Def->isMoveImmediate() && 2708 isInlineConstant(Def->getOperand(1)) && 2709 MRI->hasOneUse(Src1->getReg()) && 2710 commuteInstruction(UseMI)) { 2711 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2712 } else if ((Register::isPhysicalRegister(Src1->getReg()) && 2713 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2714 (Register::isVirtualRegister(Src1->getReg()) && 2715 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2716 return false; 2717 // VGPR is okay as Src1 - fallthrough 2718 } 2719 2720 unsigned NewOpc = 2721 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) 2722 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 2723 if (pseudoToMCOpcode(NewOpc) == -1) 2724 return false; 2725 2726 const int64_t Imm = ImmOp->getImm(); 2727 2728 // FIXME: This would be a lot easier if we could return a new instruction 2729 // instead of having to modify in place. 2730 2731 // Remove these first since they are at the end. 
2732 UseMI.RemoveOperand(
2733 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2734 UseMI.RemoveOperand(
2735 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2736
2737 if (Opc == AMDGPU::V_MAC_F32_e64 ||
2738 Opc == AMDGPU::V_MAC_F16_e64 ||
2739 Opc == AMDGPU::V_FMAC_F32_e64 ||
2740 Opc == AMDGPU::V_FMAC_F16_e64)
2741 UseMI.untieRegOperand(
2742 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2743
2744 // ChangeToImmediate adds Src2 back to the instruction.
2745 Src2->ChangeToImmediate(Imm);
2746
2747 // These come before src2.
2748 removeModOperands(UseMI);
2749 UseMI.setDesc(get(NewOpc));
2750 // It might happen that UseMI was commuted
2751 // and we now have an SGPR as SRC1. If so, two inlined
2752 // constants and an SGPR are illegal.
2753 legalizeOperands(UseMI);
2754
2755 bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2756 if (DeleteDef)
2757 DefMI.eraseFromParent();
2758
2759 return true;
2760 }
2761 }
2762
2763 return false;
2764 }
2765
2766 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2767 int WidthB, int OffsetB) {
2768 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2769 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2770 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2771 return LowOffset + LowWidth <= HighOffset;
2772 }
2773
2774 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
2775 const MachineInstr &MIb) const {
2776 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
2777 int64_t Offset0, Offset1;
2778 unsigned Dummy0, Dummy1;
2779 bool Offset0IsScalable, Offset1IsScalable;
2780 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
2781 Dummy0, &RI) ||
2782 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
2783 Dummy1, &RI))
2784 return false;
2785
2786 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
2787 return false;
2788
2789 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2790 // FIXME: Handle ds_read2 / ds_write2.
2791 return false;
2792 }
2793 unsigned Width0 = MIa.memoperands().front()->getSize();
2794 unsigned Width1 = MIb.memoperands().front()->getSize();
2795 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
2796 }
2797
2798 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
2799 const MachineInstr &MIb) const {
2800 assert(MIa.mayLoadOrStore() &&
2801 "MIa must load from or modify a memory location");
2802 assert(MIb.mayLoadOrStore() &&
2803 "MIb must load from or modify a memory location");
2804
2805 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2806 return false;
2807
2808 // XXX - Can we relax this between address spaces?
2809 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2810 return false;
2811
2812 // TODO: Should we check the address space from the MachineMemOperand? That
2813 // would allow us to distinguish objects we know don't alias based on the
2814 // underlying address space, even if it was lowered to a different one,
2815 // e.g. private accesses lowered to use MUBUF instructions on a scratch
2816 // buffer.
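// Lacking address-space information here, the checks below only treat two
// accesses as trivially disjoint when their encodings imply different
// memory segments (e.g. DS/LDS vs. a non-FLAT VMEM access), or when they
// are the same kind of access and their offset ranges provably do not
// overlap.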
2817 if (isDS(MIa)) { 2818 if (isDS(MIb)) 2819 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2820 2821 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2822 } 2823 2824 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2825 if (isMUBUF(MIb) || isMTBUF(MIb)) 2826 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2827 2828 return !isFLAT(MIb) && !isSMRD(MIb); 2829 } 2830 2831 if (isSMRD(MIa)) { 2832 if (isSMRD(MIb)) 2833 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2834 2835 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 2836 } 2837 2838 if (isFLAT(MIa)) { 2839 if (isFLAT(MIb)) 2840 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2841 2842 return false; 2843 } 2844 2845 return false; 2846 } 2847 2848 static int64_t getFoldableImm(const MachineOperand* MO) { 2849 if (!MO->isReg()) 2850 return false; 2851 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2852 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2853 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2854 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2855 Def->getOperand(1).isImm()) 2856 return Def->getOperand(1).getImm(); 2857 return AMDGPU::NoRegister; 2858 } 2859 2860 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2861 MachineInstr &MI, 2862 LiveVariables *LV) const { 2863 unsigned Opc = MI.getOpcode(); 2864 bool IsF16 = false; 2865 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2866 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; 2867 2868 switch (Opc) { 2869 default: 2870 return nullptr; 2871 case AMDGPU::V_MAC_F16_e64: 2872 case AMDGPU::V_FMAC_F16_e64: 2873 IsF16 = true; 2874 LLVM_FALLTHROUGH; 2875 case AMDGPU::V_MAC_F32_e64: 2876 case AMDGPU::V_FMAC_F32_e64: 2877 break; 2878 case AMDGPU::V_MAC_F16_e32: 2879 case AMDGPU::V_FMAC_F16_e32: 2880 IsF16 = true; 2881 LLVM_FALLTHROUGH; 2882 case AMDGPU::V_MAC_F32_e32: 2883 case AMDGPU::V_FMAC_F32_e32: { 2884 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2885 AMDGPU::OpName::src0); 2886 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2887 if (!Src0->isReg() && !Src0->isImm()) 2888 return nullptr; 2889 2890 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2891 return nullptr; 2892 2893 break; 2894 } 2895 } 2896 2897 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2898 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2899 const MachineOperand *Src0Mods = 2900 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2901 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2902 const MachineOperand *Src1Mods = 2903 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2904 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2905 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2906 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2907 2908 if (!Src0Mods && !Src1Mods && !Clamp && !Omod && 2909 // If we have an SGPR input, we will violate the constant bus restriction. 2910 (ST.getConstantBusLimit(Opc) > 1 || 2911 !Src0->isReg() || 2912 !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 2913 if (auto Imm = getFoldableImm(Src2)) { 2914 unsigned NewOpc = 2915 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) 2916 : (IsF16 ? 
AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 2917 if (pseudoToMCOpcode(NewOpc) != -1) 2918 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2919 .add(*Dst) 2920 .add(*Src0) 2921 .add(*Src1) 2922 .addImm(Imm); 2923 } 2924 unsigned NewOpc = 2925 IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) 2926 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 2927 if (auto Imm = getFoldableImm(Src1)) { 2928 if (pseudoToMCOpcode(NewOpc) != -1) 2929 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2930 .add(*Dst) 2931 .add(*Src0) 2932 .addImm(Imm) 2933 .add(*Src2); 2934 } 2935 if (auto Imm = getFoldableImm(Src0)) { 2936 if (pseudoToMCOpcode(NewOpc) != -1 && 2937 isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, 2938 AMDGPU::OpName::src0), Src1)) 2939 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2940 .add(*Dst) 2941 .add(*Src1) 2942 .addImm(Imm) 2943 .add(*Src2); 2944 } 2945 } 2946 2947 unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) 2948 : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); 2949 if (pseudoToMCOpcode(NewOpc) == -1) 2950 return nullptr; 2951 2952 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2953 .add(*Dst) 2954 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2955 .add(*Src0) 2956 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2957 .add(*Src1) 2958 .addImm(0) // Src mods 2959 .add(*Src2) 2960 .addImm(Clamp ? Clamp->getImm() : 0) 2961 .addImm(Omod ? Omod->getImm() : 0); 2962 } 2963 2964 // It's not generally safe to move VALU instructions across these since it will 2965 // start using the register as a base index rather than directly. 2966 // XXX - Why isn't hasSideEffects sufficient for these? 2967 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 2968 switch (MI.getOpcode()) { 2969 case AMDGPU::S_SET_GPR_IDX_ON: 2970 case AMDGPU::S_SET_GPR_IDX_MODE: 2971 case AMDGPU::S_SET_GPR_IDX_OFF: 2972 return true; 2973 default: 2974 return false; 2975 } 2976 } 2977 2978 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 2979 const MachineBasicBlock *MBB, 2980 const MachineFunction &MF) const { 2981 // Skipping the check for SP writes in the base implementation. The reason it 2982 // was added was apparently due to compile time concerns. 2983 // 2984 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops 2985 // but is probably avoidable. 2986 2987 // Copied from base implementation. 2988 // Terminators and labels can't be scheduled around. 2989 if (MI.isTerminator() || MI.isPosition()) 2990 return true; 2991 2992 // Target-independent instructions do not have an implicit-use of EXEC, even 2993 // when they operate on VGPRs. Treating EXEC modifications as scheduling 2994 // boundaries prevents incorrect movements of such instructions. 2995 2996 // TODO: Don't treat setreg with known constant that only changes MODE as 2997 // barrier. 
2998 return MI.modifiesRegister(AMDGPU::EXEC, &RI) || 2999 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 3000 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 3001 changesVGPRIndexingMode(MI); 3002 } 3003 3004 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 3005 return Opcode == AMDGPU::DS_ORDERED_COUNT || 3006 Opcode == AMDGPU::DS_GWS_INIT || 3007 Opcode == AMDGPU::DS_GWS_SEMA_V || 3008 Opcode == AMDGPU::DS_GWS_SEMA_BR || 3009 Opcode == AMDGPU::DS_GWS_SEMA_P || 3010 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 3011 Opcode == AMDGPU::DS_GWS_BARRIER; 3012 } 3013 3014 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 3015 unsigned Opcode = MI.getOpcode(); 3016 3017 if (MI.mayStore() && isSMRD(MI)) 3018 return true; // scalar store or atomic 3019 3020 // This will terminate the function when other lanes may need to continue. 3021 if (MI.isReturn()) 3022 return true; 3023 3024 // These instructions cause shader I/O that may cause hardware lockups 3025 // when executed with an empty EXEC mask. 3026 // 3027 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 3028 // EXEC = 0, but checking for that case here seems not worth it 3029 // given the typical code patterns. 3030 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 3031 Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || 3032 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 3033 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 3034 return true; 3035 3036 if (MI.isCall() || MI.isInlineAsm()) 3037 return true; // conservative assumption 3038 3039 // These are like SALU instructions in terms of effects, so it's questionable 3040 // whether we should return true for those. 3041 // 3042 // However, executing them with EXEC = 0 causes them to operate on undefined 3043 // data, which we avoid by returning true here. 3044 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32) 3045 return true; 3046 3047 return false; 3048 } 3049 3050 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 3051 const MachineInstr &MI) const { 3052 if (MI.isMetaInstruction()) 3053 return false; 3054 3055 // This won't read exec if this is an SGPR->SGPR copy. 3056 if (MI.isCopyLike()) { 3057 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) 3058 return true; 3059 3060 // Make sure this isn't copying exec as a normal operand 3061 return MI.readsRegister(AMDGPU::EXEC, &RI); 3062 } 3063 3064 // Make a conservative assumption about the callee. 3065 if (MI.isCall()) 3066 return true; 3067 3068 // Be conservative with any unhandled generic opcodes. 3069 if (!isTargetSpecificOpcode(MI.getOpcode())) 3070 return true; 3071 3072 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); 3073 } 3074 3075 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 3076 switch (Imm.getBitWidth()) { 3077 case 1: // This likely will be a condition code mask. 
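// Both possible 1-bit values (0 and 1) are inline constants, so any i1
// immediate is trivially inlinable.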
3078 return true; 3079 3080 case 32: 3081 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 3082 ST.hasInv2PiInlineImm()); 3083 case 64: 3084 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 3085 ST.hasInv2PiInlineImm()); 3086 case 16: 3087 return ST.has16BitInsts() && 3088 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 3089 ST.hasInv2PiInlineImm()); 3090 default: 3091 llvm_unreachable("invalid bitwidth"); 3092 } 3093 } 3094 3095 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 3096 uint8_t OperandType) const { 3097 if (!MO.isImm() || 3098 OperandType < AMDGPU::OPERAND_SRC_FIRST || 3099 OperandType > AMDGPU::OPERAND_SRC_LAST) 3100 return false; 3101 3102 // MachineOperand provides no way to tell the true operand size, since it only 3103 // records a 64-bit value. We need to know the size to determine if a 32-bit 3104 // floating point immediate bit pattern is legal for an integer immediate. It 3105 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 3106 3107 int64_t Imm = MO.getImm(); 3108 switch (OperandType) { 3109 case AMDGPU::OPERAND_REG_IMM_INT32: 3110 case AMDGPU::OPERAND_REG_IMM_FP32: 3111 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3112 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3113 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3114 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { 3115 int32_t Trunc = static_cast<int32_t>(Imm); 3116 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 3117 } 3118 case AMDGPU::OPERAND_REG_IMM_INT64: 3119 case AMDGPU::OPERAND_REG_IMM_FP64: 3120 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3121 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3122 return AMDGPU::isInlinableLiteral64(MO.getImm(), 3123 ST.hasInv2PiInlineImm()); 3124 case AMDGPU::OPERAND_REG_IMM_INT16: 3125 case AMDGPU::OPERAND_REG_IMM_FP16: 3126 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3127 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3128 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3129 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3130 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 3131 // A few special case instructions have 16-bit operands on subtargets 3132 // where 16-bit instructions are not legal. 3133 // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle 3134 // constants in these cases 3135 int16_t Trunc = static_cast<int16_t>(Imm); 3136 return ST.has16BitInsts() && 3137 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 3138 } 3139 3140 return false; 3141 } 3142 case AMDGPU::OPERAND_REG_IMM_V2INT16: 3143 case AMDGPU::OPERAND_REG_IMM_V2FP16: 3144 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 3145 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 3146 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: 3147 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 3148 uint32_t Trunc = static_cast<uint32_t>(Imm); 3149 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 3150 } 3151 default: 3152 llvm_unreachable("invalid bitwidth"); 3153 } 3154 } 3155 3156 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 3157 const MCOperandInfo &OpInfo) const { 3158 switch (MO.getType()) { 3159 case MachineOperand::MO_Register: 3160 return false; 3161 case MachineOperand::MO_Immediate: 3162 return !isInlineConstant(MO, OpInfo); 3163 case MachineOperand::MO_FrameIndex: 3164 case MachineOperand::MO_MachineBasicBlock: 3165 case MachineOperand::MO_ExternalSymbol: 3166 case MachineOperand::MO_GlobalAddress: 3167 case MachineOperand::MO_MCSymbol: 3168 return true; 3169 default: 3170 llvm_unreachable("unexpected operand type"); 3171 } 3172 } 3173 3174 static bool compareMachineOp(const MachineOperand &Op0, 3175 const MachineOperand &Op1) { 3176 if (Op0.getType() != Op1.getType()) 3177 return false; 3178 3179 switch (Op0.getType()) { 3180 case MachineOperand::MO_Register: 3181 return Op0.getReg() == Op1.getReg(); 3182 case MachineOperand::MO_Immediate: 3183 return Op0.getImm() == Op1.getImm(); 3184 default: 3185 llvm_unreachable("Didn't expect to be comparing these operand types"); 3186 } 3187 } 3188 3189 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 3190 const MachineOperand &MO) const { 3191 const MCInstrDesc &InstDesc = MI.getDesc(); 3192 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; 3193 3194 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3195 3196 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3197 return true; 3198 3199 if (OpInfo.RegClass < 0) 3200 return false; 3201 3202 const MachineFunction *MF = MI.getParent()->getParent(); 3203 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3204 3205 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3206 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3207 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3208 AMDGPU::OpName::src2)) 3209 return false; 3210 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3211 } 3212 3213 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3214 return false; 3215 3216 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3217 return true; 3218 3219 return ST.hasVOP3Literal(); 3220 } 3221 3222 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3223 int Op32 = AMDGPU::getVOPe32(Opcode); 3224 if (Op32 == -1) 3225 return false; 3226 3227 return pseudoToMCOpcode(Op32) != -1; 3228 } 3229 3230 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3231 // The src0_modifier operand is present on all instructions 3232 // that have modifiers. 
3233
3234 return AMDGPU::getNamedOperandIdx(Opcode,
3235 AMDGPU::OpName::src0_modifiers) != -1;
3236 }
3237
3238 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
3239 unsigned OpName) const {
3240 const MachineOperand *Mods = getNamedOperand(MI, OpName);
3241 return Mods && Mods->getImm();
3242 }
3243
3244 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
3245 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
3246 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
3247 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
3248 hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
3249 hasModifiersSet(MI, AMDGPU::OpName::omod);
3250 }
3251
3252 bool SIInstrInfo::canShrink(const MachineInstr &MI,
3253 const MachineRegisterInfo &MRI) const {
3254 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3255 // Can't shrink instruction with three operands.
3256 // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
3257 // a special case for it. It can only be shrunk if the third operand
3258 // is vcc, and src0_modifiers and src1_modifiers are not set.
3259 // We should handle this the same way we handle vopc, by adding
3260 // a register allocation hint pre-regalloc and then doing the shrinking
3261 // post-regalloc.
3262 if (Src2) {
3263 switch (MI.getOpcode()) {
3264 default: return false;
3265
3266 case AMDGPU::V_ADDC_U32_e64:
3267 case AMDGPU::V_SUBB_U32_e64:
3268 case AMDGPU::V_SUBBREV_U32_e64: {
3269 const MachineOperand *Src1
3270 = getNamedOperand(MI, AMDGPU::OpName::src1);
3271 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
3272 return false;
3273 // Additional verification is needed for sdst/src2.
3274 return true;
3275 }
3276 case AMDGPU::V_MAC_F32_e64:
3277 case AMDGPU::V_MAC_F16_e64:
3278 case AMDGPU::V_FMAC_F32_e64:
3279 case AMDGPU::V_FMAC_F16_e64:
3280 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
3281 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
3282 return false;
3283 break;
3284
3285 case AMDGPU::V_CNDMASK_B32_e64:
3286 break;
3287 }
3288 }
3289
3290 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3291 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
3292 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
3293 return false;
3294
3295 // We don't need to check src0, all input types are legal, so just make sure
3296 // src0 isn't using any modifiers.
3297 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
3298 return false;
3299
3300 // Can it be shrunk to a valid 32 bit opcode?
3301 if (!hasVALU32BitEncoding(MI.getOpcode()))
3302 return false;
3303
3304 // Check output modifiers
3305 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
3306 !hasModifiersSet(MI, AMDGPU::OpName::clamp);
3307 }
3308
3309 // Set VCC operand with all flags from \p Orig, except for setting it as
3310 // implicit.
3311 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3312 const MachineOperand &Orig) { 3313 3314 for (MachineOperand &Use : MI.implicit_operands()) { 3315 if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { 3316 Use.setIsUndef(Orig.isUndef()); 3317 Use.setIsKill(Orig.isKill()); 3318 return; 3319 } 3320 } 3321 } 3322 3323 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3324 unsigned Op32) const { 3325 MachineBasicBlock *MBB = MI.getParent();; 3326 MachineInstrBuilder Inst32 = 3327 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 3328 .setMIFlags(MI.getFlags()); 3329 3330 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3331 // For VOPC instructions, this is replaced by an implicit def of vcc. 3332 int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); 3333 if (Op32DstIdx != -1) { 3334 // dst 3335 Inst32.add(MI.getOperand(0)); 3336 } else { 3337 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 3338 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 3339 "Unexpected case"); 3340 } 3341 3342 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 3343 3344 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3345 if (Src1) 3346 Inst32.add(*Src1); 3347 3348 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3349 3350 if (Src2) { 3351 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 3352 if (Op32Src2Idx != -1) { 3353 Inst32.add(*Src2); 3354 } else { 3355 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 3356 // replaced with an implicit read of vcc. This was already added 3357 // during the initial BuildMI, so find it to preserve the flags. 3358 copyFlagsToImplicitVCC(*Inst32, *Src2); 3359 } 3360 } 3361 3362 return Inst32; 3363 } 3364 3365 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 3366 const MachineOperand &MO, 3367 const MCOperandInfo &OpInfo) const { 3368 // Literal constants use the constant bus. 3369 //if (isLiteralConstantLike(MO, OpInfo)) 3370 // return true; 3371 if (MO.isImm()) 3372 return !isInlineConstant(MO, OpInfo); 3373 3374 if (!MO.isReg()) 3375 return true; // Misc other operands like FrameIndex 3376 3377 if (!MO.isUse()) 3378 return false; 3379 3380 if (Register::isVirtualRegister(MO.getReg())) 3381 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 3382 3383 // Null is free 3384 if (MO.getReg() == AMDGPU::SGPR_NULL) 3385 return false; 3386 3387 // SGPRs use the constant bus 3388 if (MO.isImplicit()) { 3389 return MO.getReg() == AMDGPU::M0 || 3390 MO.getReg() == AMDGPU::VCC || 3391 MO.getReg() == AMDGPU::VCC_LO; 3392 } else { 3393 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 3394 AMDGPU::SReg_64RegClass.contains(MO.getReg()); 3395 } 3396 } 3397 3398 static Register findImplicitSGPRRead(const MachineInstr &MI) { 3399 for (const MachineOperand &MO : MI.implicit_operands()) { 3400 // We only care about reads. 
3401 if (MO.isDef()) 3402 continue; 3403 3404 switch (MO.getReg()) { 3405 case AMDGPU::VCC: 3406 case AMDGPU::VCC_LO: 3407 case AMDGPU::VCC_HI: 3408 case AMDGPU::M0: 3409 case AMDGPU::FLAT_SCR: 3410 return MO.getReg(); 3411 3412 default: 3413 break; 3414 } 3415 } 3416 3417 return AMDGPU::NoRegister; 3418 } 3419 3420 static bool shouldReadExec(const MachineInstr &MI) { 3421 if (SIInstrInfo::isVALU(MI)) { 3422 switch (MI.getOpcode()) { 3423 case AMDGPU::V_READLANE_B32: 3424 case AMDGPU::V_READLANE_B32_gfx6_gfx7: 3425 case AMDGPU::V_READLANE_B32_gfx10: 3426 case AMDGPU::V_READLANE_B32_vi: 3427 case AMDGPU::V_WRITELANE_B32: 3428 case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: 3429 case AMDGPU::V_WRITELANE_B32_gfx10: 3430 case AMDGPU::V_WRITELANE_B32_vi: 3431 return false; 3432 } 3433 3434 return true; 3435 } 3436 3437 if (MI.isPreISelOpcode() || 3438 SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 3439 SIInstrInfo::isSALU(MI) || 3440 SIInstrInfo::isSMRD(MI)) 3441 return false; 3442 3443 return true; 3444 } 3445 3446 static bool isSubRegOf(const SIRegisterInfo &TRI, 3447 const MachineOperand &SuperVec, 3448 const MachineOperand &SubReg) { 3449 if (Register::isPhysicalRegister(SubReg.getReg())) 3450 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 3451 3452 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 3453 SubReg.getReg() == SuperVec.getReg(); 3454 } 3455 3456 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 3457 StringRef &ErrInfo) const { 3458 uint16_t Opcode = MI.getOpcode(); 3459 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 3460 return true; 3461 3462 const MachineFunction *MF = MI.getParent()->getParent(); 3463 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3464 3465 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 3466 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 3467 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 3468 3469 // Make sure the number of operands is correct. 3470 const MCInstrDesc &Desc = get(Opcode); 3471 if (!Desc.isVariadic() && 3472 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 3473 ErrInfo = "Instruction has wrong number of operands."; 3474 return false; 3475 } 3476 3477 if (MI.isInlineAsm()) { 3478 // Verify register classes for inlineasm constraints. 3479 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 3480 I != E; ++I) { 3481 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 3482 if (!RC) 3483 continue; 3484 3485 const MachineOperand &Op = MI.getOperand(I); 3486 if (!Op.isReg()) 3487 continue; 3488 3489 Register Reg = Op.getReg(); 3490 if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) { 3491 ErrInfo = "inlineasm operand has incorrect register class."; 3492 return false; 3493 } 3494 } 3495 3496 return true; 3497 } 3498 3499 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 3500 ErrInfo = "missing memory operand from MIMG instruction."; 3501 return false; 3502 } 3503 3504 // Make sure the register classes are correct. 3505 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 3506 if (MI.getOperand(i).isFPImm()) { 3507 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 3508 "all fp values to integers."; 3509 return false; 3510 } 3511 3512 int RegClass = Desc.OpInfo[i].RegClass; 3513 3514 switch (Desc.OpInfo[i].OperandType) { 3515 case MCOI::OPERAND_REGISTER: 3516 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 3517 ErrInfo = "Illegal immediate value for operand."; 3518 return false; 3519 } 3520 break; 3521 case AMDGPU::OPERAND_REG_IMM_INT32: 3522 case AMDGPU::OPERAND_REG_IMM_FP32: 3523 break; 3524 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3525 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3526 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3527 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3528 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3529 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3530 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3531 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 3532 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3533 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3534 const MachineOperand &MO = MI.getOperand(i); 3535 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 3536 ErrInfo = "Illegal immediate value for operand."; 3537 return false; 3538 } 3539 break; 3540 } 3541 case MCOI::OPERAND_IMMEDIATE: 3542 case AMDGPU::OPERAND_KIMM32: 3543 // Check if this operand is an immediate. 3544 // FrameIndex operands will be replaced by immediates, so they are 3545 // allowed. 3546 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 3547 ErrInfo = "Expected immediate, but got non-immediate"; 3548 return false; 3549 } 3550 LLVM_FALLTHROUGH; 3551 default: 3552 continue; 3553 } 3554 3555 if (!MI.getOperand(i).isReg()) 3556 continue; 3557 3558 if (RegClass != -1) { 3559 Register Reg = MI.getOperand(i).getReg(); 3560 if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg)) 3561 continue; 3562 3563 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 3564 if (!RC->contains(Reg)) { 3565 ErrInfo = "Operand has incorrect register class."; 3566 return false; 3567 } 3568 } 3569 } 3570 3571 // Verify SDWA 3572 if (isSDWA(MI)) { 3573 if (!ST.hasSDWA()) { 3574 ErrInfo = "SDWA is not supported on this target"; 3575 return false; 3576 } 3577 3578 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 3579 3580 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 3581 3582 for (int OpIdx: OpIndicies) { 3583 if (OpIdx == -1) 3584 continue; 3585 const MachineOperand &MO = MI.getOperand(OpIdx); 3586 3587 if (!ST.hasSDWAScalar()) { 3588 // Only VGPRS on VI 3589 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 3590 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 3591 return false; 3592 } 3593 } else { 3594 // No immediates on GFX9 3595 if (!MO.isReg()) { 3596 ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; 3597 return false; 3598 } 3599 } 3600 } 3601 3602 if (!ST.hasSDWAOmod()) { 3603 // No omod allowed on VI 3604 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3605 if (OMod != nullptr && 3606 (!OMod->isImm() || OMod->getImm() != 0)) { 3607 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 3608 return false; 3609 } 3610 } 3611 3612 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 3613 if (isVOPC(BasicOpcode)) { 3614 if (!ST.hasSDWASdst() && DstIdx != -1) { 3615 // Only vcc allowed as dst on VI for VOPC 3616 const MachineOperand &Dst = MI.getOperand(DstIdx); 3617 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 3618 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 3619 return 
false; 3620 } 3621 } else if (!ST.hasSDWAOutModsVOPC()) { 3622 // No clamp allowed on GFX9 for VOPC 3623 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3624 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 3625 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 3626 return false; 3627 } 3628 3629 // No omod allowed on GFX9 for VOPC 3630 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3631 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 3632 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 3633 return false; 3634 } 3635 } 3636 } 3637 3638 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 3639 if (DstUnused && DstUnused->isImm() && 3640 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 3641 const MachineOperand &Dst = MI.getOperand(DstIdx); 3642 if (!Dst.isReg() || !Dst.isTied()) { 3643 ErrInfo = "Dst register should have tied register"; 3644 return false; 3645 } 3646 3647 const MachineOperand &TiedMO = 3648 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 3649 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 3650 ErrInfo = 3651 "Dst register should be tied to implicit use of preserved register"; 3652 return false; 3653 } else if (Register::isPhysicalRegister(TiedMO.getReg()) && 3654 Dst.getReg() != TiedMO.getReg()) { 3655 ErrInfo = "Dst register should use same physical register as preserved"; 3656 return false; 3657 } 3658 } 3659 } 3660 3661 // Verify MIMG 3662 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 3663 // Ensure that the return type used is large enough for all the options 3664 // being used TFE/LWE require an extra result register. 3665 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 3666 if (DMask) { 3667 uint64_t DMaskImm = DMask->getImm(); 3668 uint32_t RegCount = 3669 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 3670 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 3671 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 3672 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 3673 3674 // Adjust for packed 16 bit values 3675 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 3676 RegCount >>= 1; 3677 3678 // Adjust if using LWE or TFE 3679 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 3680 RegCount += 1; 3681 3682 const uint32_t DstIdx = 3683 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 3684 const MachineOperand &Dst = MI.getOperand(DstIdx); 3685 if (Dst.isReg()) { 3686 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 3687 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 3688 if (RegCount > DstSize) { 3689 ErrInfo = "MIMG instruction returns too many registers for dst " 3690 "register class"; 3691 return false; 3692 } 3693 } 3694 } 3695 } 3696 3697 // Verify VOP*. Ignore multiple sgpr operands on writelane. 3698 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 3699 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 3700 // Only look at the true operands. Only a real operand can use the constant 3701 // bus, and we don't want to check pseudo-operands like the source modifier 3702 // flags. 
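// Constant bus accounting below: every distinct SGPR read (explicit or implicit, e.g. VCC or M0) and every literal operand takes one slot; the per-opcode limit is queried from the subtarget further down.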
3703 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 3704 3705 unsigned ConstantBusCount = 0; 3706 unsigned LiteralCount = 0; 3707 3708 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 3709 ++ConstantBusCount; 3710 3711 SmallVector<Register, 2> SGPRsUsed; 3712 Register SGPRUsed = findImplicitSGPRRead(MI); 3713 if (SGPRUsed != AMDGPU::NoRegister) { 3714 ++ConstantBusCount; 3715 SGPRsUsed.push_back(SGPRUsed); 3716 } 3717 3718 for (int OpIdx : OpIndices) { 3719 if (OpIdx == -1) 3720 break; 3721 const MachineOperand &MO = MI.getOperand(OpIdx); 3722 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3723 if (MO.isReg()) { 3724 SGPRUsed = MO.getReg(); 3725 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 3726 return !RI.regsOverlap(SGPRUsed, SGPR); 3727 })) { 3728 ++ConstantBusCount; 3729 SGPRsUsed.push_back(SGPRUsed); 3730 } 3731 } else { 3732 ++ConstantBusCount; 3733 ++LiteralCount; 3734 } 3735 } 3736 } 3737 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3738 // v_writelane_b32 is an exception from constant bus restriction: 3739 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 3740 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 3741 Opcode != AMDGPU::V_WRITELANE_B32) { 3742 ErrInfo = "VOP* instruction violates constant bus restriction"; 3743 return false; 3744 } 3745 3746 if (isVOP3(MI) && LiteralCount) { 3747 if (LiteralCount && !ST.hasVOP3Literal()) { 3748 ErrInfo = "VOP3 instruction uses literal"; 3749 return false; 3750 } 3751 if (LiteralCount > 1) { 3752 ErrInfo = "VOP3 instruction uses more than one literal"; 3753 return false; 3754 } 3755 } 3756 } 3757 3758 // Special case for writelane - this can break the multiple constant bus rule, 3759 // but still can't use more than one SGPR register 3760 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 3761 unsigned SGPRCount = 0; 3762 Register SGPRUsed = AMDGPU::NoRegister; 3763 3764 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { 3765 if (OpIdx == -1) 3766 break; 3767 3768 const MachineOperand &MO = MI.getOperand(OpIdx); 3769 3770 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3771 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 3772 if (MO.getReg() != SGPRUsed) 3773 ++SGPRCount; 3774 SGPRUsed = MO.getReg(); 3775 } 3776 } 3777 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 3778 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 3779 return false; 3780 } 3781 } 3782 } 3783 3784 // Verify misc. restrictions on specific instructions. 
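// V_DIV_SCALE_{F32|F64} is only well formed when src0 holds the same value as src1 or src2; the check below enforces this when all three sources are registers.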
3785 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 3786 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 3787 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3788 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3789 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 3790 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 3791 if (!compareMachineOp(Src0, Src1) && 3792 !compareMachineOp(Src0, Src2)) { 3793 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 3794 return false; 3795 } 3796 } 3797 } 3798 3799 if (isSOP2(MI) || isSOPC(MI)) { 3800 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3801 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3802 unsigned Immediates = 0; 3803 3804 if (!Src0.isReg() && 3805 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) 3806 Immediates++; 3807 if (!Src1.isReg() && 3808 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) 3809 Immediates++; 3810 3811 if (Immediates > 1) { 3812 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 3813 return false; 3814 } 3815 } 3816 3817 if (isSOPK(MI)) { 3818 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 3819 if (Desc.isBranch()) { 3820 if (!Op->isMBB()) { 3821 ErrInfo = "invalid branch target for SOPK instruction"; 3822 return false; 3823 } 3824 } else { 3825 uint64_t Imm = Op->getImm(); 3826 if (sopkIsZext(MI)) { 3827 if (!isUInt<16>(Imm)) { 3828 ErrInfo = "invalid immediate for SOPK instruction"; 3829 return false; 3830 } 3831 } else { 3832 if (!isInt<16>(Imm)) { 3833 ErrInfo = "invalid immediate for SOPK instruction"; 3834 return false; 3835 } 3836 } 3837 } 3838 } 3839 3840 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 3841 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 3842 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3843 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 3844 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3845 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 3846 3847 const unsigned StaticNumOps = Desc.getNumOperands() + 3848 Desc.getNumImplicitUses(); 3849 const unsigned NumImplicitOps = IsDst ? 2 : 1; 3850 3851 // Allow additional implicit operands. This allows a fixup done by the post 3852 // RA scheduler where the main implicit operand is killed and implicit-defs 3853 // are added for sub-registers that remain live after this instruction. 3854 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 3855 ErrInfo = "missing implicit register operands"; 3856 return false; 3857 } 3858 3859 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3860 if (IsDst) { 3861 if (!Dst->isUse()) { 3862 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 3863 return false; 3864 } 3865 3866 unsigned UseOpIdx; 3867 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 3868 UseOpIdx != StaticNumOps + 1) { 3869 ErrInfo = "movrel implicit operands should be tied"; 3870 return false; 3871 } 3872 } 3873 3874 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3875 const MachineOperand &ImpUse 3876 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 3877 if (!ImpUse.isReg() || !ImpUse.isUse() || 3878 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 3879 ErrInfo = "src0 should be subreg of implicit vector use"; 3880 return false; 3881 } 3882 } 3883 3884 // Make sure we aren't losing exec uses in the td files. This mostly requires 3885 // being careful when using let Uses to try to add other use registers. 
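// Everything shouldReadExec() classifies as lane-masked must carry an implicit use of EXEC; readlane/writelane and scalar instructions are exempt.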
3886 if (shouldReadExec(MI)) { 3887 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 3888 ErrInfo = "VALU instruction does not implicitly read exec mask"; 3889 return false; 3890 } 3891 } 3892 3893 if (isSMRD(MI)) { 3894 if (MI.mayStore()) { 3895 // The register offset form of scalar stores may only use m0 as the 3896 // soffset register. 3897 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 3898 if (Soff && Soff->getReg() != AMDGPU::M0) { 3899 ErrInfo = "scalar stores must use m0 as offset register"; 3900 return false; 3901 } 3902 } 3903 } 3904 3905 if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { 3906 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3907 if (Offset->getImm() != 0) { 3908 ErrInfo = "subtarget does not support offsets in flat instructions"; 3909 return false; 3910 } 3911 } 3912 3913 if (isMIMG(MI)) { 3914 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 3915 if (DimOp) { 3916 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 3917 AMDGPU::OpName::vaddr0); 3918 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 3919 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 3920 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3921 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 3922 const AMDGPU::MIMGDimInfo *Dim = 3923 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 3924 3925 if (!Dim) { 3926 ErrInfo = "dim is out of range"; 3927 return false; 3928 } 3929 3930 bool IsA16 = false; 3931 if (ST.hasR128A16()) { 3932 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 3933 IsA16 = R128A16->getImm() != 0; 3934 } else if (ST.hasGFX10A16()) { 3935 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 3936 IsA16 = A16->getImm() != 0; 3937 } 3938 3939 bool PackDerivatives = IsA16; // Either A16 or G16 3940 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 3941 3942 unsigned AddrWords = BaseOpcode->NumExtraArgs; 3943 unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + 3944 (BaseOpcode->LodOrClampOrMip ? 1 : 0); 3945 if (IsA16) 3946 AddrWords += (AddrComponents + 1) / 2; 3947 else 3948 AddrWords += AddrComponents; 3949 3950 if (BaseOpcode->Gradients) { 3951 if (PackDerivatives) 3952 // There are two gradients per coordinate, we pack them separately. 
3953 // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) 3954 AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; 3955 else 3956 AddrWords += Dim->NumGradients; 3957 } 3958 3959 unsigned VAddrWords; 3960 if (IsNSA) { 3961 VAddrWords = SRsrcIdx - VAddr0Idx; 3962 } else { 3963 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); 3964 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; 3965 if (AddrWords > 8) 3966 AddrWords = 16; 3967 else if (AddrWords > 4) 3968 AddrWords = 8; 3969 else if (AddrWords == 4) 3970 AddrWords = 4; 3971 else if (AddrWords == 3) 3972 AddrWords = 3; 3973 } 3974 3975 if (VAddrWords != AddrWords) { 3976 ErrInfo = "bad vaddr size"; 3977 return false; 3978 } 3979 } 3980 } 3981 3982 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 3983 if (DppCt) { 3984 using namespace AMDGPU::DPP; 3985 3986 unsigned DC = DppCt->getImm(); 3987 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 3988 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 3989 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 3990 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 3991 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 3992 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 3993 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 3994 ErrInfo = "Invalid dpp_ctrl value"; 3995 return false; 3996 } 3997 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && 3998 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 3999 ErrInfo = "Invalid dpp_ctrl value: " 4000 "wavefront shifts are not supported on GFX10+"; 4001 return false; 4002 } 4003 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && 4004 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4005 ErrInfo = "Invalid dpp_ctrl value: " 4006 "broadcasts are not supported on GFX10+"; 4007 return false; 4008 } 4009 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && 4010 ST.getGeneration() < AMDGPUSubtarget::GFX10) { 4011 ErrInfo = "Invalid dpp_ctrl value: " 4012 "row_share and row_xmask are not supported before GFX10"; 4013 return false; 4014 } 4015 } 4016 4017 return true; 4018 } 4019 4020 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { 4021 switch (MI.getOpcode()) { 4022 default: return AMDGPU::INSTRUCTION_LIST_END; 4023 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 4024 case AMDGPU::COPY: return AMDGPU::COPY; 4025 case AMDGPU::PHI: return AMDGPU::PHI; 4026 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 4027 case AMDGPU::WQM: return AMDGPU::WQM; 4028 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; 4029 case AMDGPU::WWM: return AMDGPU::WWM; 4030 case AMDGPU::S_MOV_B32: { 4031 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4032 return MI.getOperand(1).isReg() || 4033 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? 4034 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 4035 } 4036 case AMDGPU::S_ADD_I32: 4037 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; 4038 case AMDGPU::S_ADDC_U32: 4039 return AMDGPU::V_ADDC_U32_e32; 4040 case AMDGPU::S_SUB_I32: 4041 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; 4042 // FIXME: These are not consistently handled, and selected when the carry is 4043 // used. 
4044 case AMDGPU::S_ADD_U32: 4045 return AMDGPU::V_ADD_I32_e32; 4046 case AMDGPU::S_SUB_U32: 4047 return AMDGPU::V_SUB_I32_e32; 4048 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 4049 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32; 4050 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32; 4051 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32; 4052 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 4053 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 4054 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 4055 case AMDGPU::S_XNOR_B32: 4056 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 4057 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 4058 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 4059 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 4060 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 4061 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 4062 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 4063 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 4064 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 4065 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 4066 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 4067 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 4068 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 4069 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 4070 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 4071 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 4072 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 4073 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 4074 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 4075 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 4076 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 4077 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 4078 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 4079 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 4080 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 4081 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 4082 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 4083 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 4084 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 4085 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 4086 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 4087 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 4088 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 4089 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 4090 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 4091 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 4092 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 4093 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 4094 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 4095 } 4096 llvm_unreachable( 4097 "Unexpected scalar opcode without corresponding vector one!"); 4098 } 4099 4100 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 4101 unsigned OpNo) const { 4102 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4103 const MCInstrDesc &Desc = get(MI.getOpcode()); 4104 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 4105 Desc.OpInfo[OpNo].RegClass == -1) { 4106 Register Reg = MI.getOperand(OpNo).getReg(); 4107 4108 if 
(Register::isVirtualRegister(Reg)) 4109 return MRI.getRegClass(Reg); 4110 return RI.getPhysRegClass(Reg); 4111 } 4112 4113 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 4114 return RI.getRegClass(RCID); 4115 } 4116 4117 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 4118 MachineBasicBlock::iterator I = MI; 4119 MachineBasicBlock *MBB = MI.getParent(); 4120 MachineOperand &MO = MI.getOperand(OpIdx); 4121 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 4122 const SIRegisterInfo *TRI = 4123 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 4124 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 4125 const TargetRegisterClass *RC = RI.getRegClass(RCID); 4126 unsigned Size = TRI->getRegSizeInBits(*RC); 4127 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; 4128 if (MO.isReg()) 4129 Opcode = AMDGPU::COPY; 4130 else if (RI.isSGPRClass(RC)) 4131 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 4132 4133 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 4134 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 4135 VRC = &AMDGPU::VReg_64RegClass; 4136 else 4137 VRC = &AMDGPU::VGPR_32RegClass; 4138 4139 Register Reg = MRI.createVirtualRegister(VRC); 4140 DebugLoc DL = MBB->findDebugLoc(I); 4141 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 4142 MO.ChangeToRegister(Reg, false); 4143 } 4144 4145 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 4146 MachineRegisterInfo &MRI, 4147 MachineOperand &SuperReg, 4148 const TargetRegisterClass *SuperRC, 4149 unsigned SubIdx, 4150 const TargetRegisterClass *SubRC) 4151 const { 4152 MachineBasicBlock *MBB = MI->getParent(); 4153 DebugLoc DL = MI->getDebugLoc(); 4154 Register SubReg = MRI.createVirtualRegister(SubRC); 4155 4156 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 4157 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4158 .addReg(SuperReg.getReg(), 0, SubIdx); 4159 return SubReg; 4160 } 4161 4162 // Just in case the super register is itself a sub-register, copy it to a new 4163 // value so we don't need to worry about merging its subreg index with the 4164 // SubIdx passed to this function. The register coalescer should be able to 4165 // eliminate this extra copy. 
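// Copy the whole super value into a fresh SuperRC register first, then extract SubIdx from that copy.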
4166 Register NewSuperReg = MRI.createVirtualRegister(SuperRC); 4167 4168 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 4169 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 4170 4171 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4172 .addReg(NewSuperReg, 0, SubIdx); 4173 4174 return SubReg; 4175 } 4176 4177 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 4178 MachineBasicBlock::iterator MII, 4179 MachineRegisterInfo &MRI, 4180 MachineOperand &Op, 4181 const TargetRegisterClass *SuperRC, 4182 unsigned SubIdx, 4183 const TargetRegisterClass *SubRC) const { 4184 if (Op.isImm()) { 4185 if (SubIdx == AMDGPU::sub0) 4186 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 4187 if (SubIdx == AMDGPU::sub1) 4188 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 4189 4190 llvm_unreachable("Unhandled register index for immediate"); 4191 } 4192 4193 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 4194 SubIdx, SubRC); 4195 return MachineOperand::CreateReg(SubReg, false); 4196 } 4197 4198 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 4199 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 4200 assert(Inst.getNumExplicitOperands() == 3); 4201 MachineOperand Op1 = Inst.getOperand(1); 4202 Inst.RemoveOperand(1); 4203 Inst.addOperand(Op1); 4204 } 4205 4206 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 4207 const MCOperandInfo &OpInfo, 4208 const MachineOperand &MO) const { 4209 if (!MO.isReg()) 4210 return false; 4211 4212 Register Reg = MO.getReg(); 4213 const TargetRegisterClass *RC = Register::isVirtualRegister(Reg) 4214 ? MRI.getRegClass(Reg) 4215 : RI.getPhysRegClass(Reg); 4216 4217 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); 4218 if (MO.getSubReg()) { 4219 const MachineFunction *MF = MO.getParent()->getParent()->getParent(); 4220 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); 4221 if (!SuperRC) 4222 return false; 4223 4224 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); 4225 if (!DRC) 4226 return false; 4227 } 4228 return RC->hasSuperClassEq(DRC); 4229 } 4230 4231 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 4232 const MCOperandInfo &OpInfo, 4233 const MachineOperand &MO) const { 4234 if (MO.isReg()) 4235 return isLegalRegOperand(MRI, OpInfo, MO); 4236 4237 // Handle non-register types that are treated like immediates. 4238 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 4239 return true; 4240 } 4241 4242 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 4243 const MachineOperand *MO) const { 4244 const MachineFunction &MF = *MI.getParent()->getParent(); 4245 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4246 const MCInstrDesc &InstDesc = MI.getDesc(); 4247 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 4248 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4249 const TargetRegisterClass *DefinedRC = 4250 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 4251 if (!MO) 4252 MO = &MI.getOperand(OpIdx); 4253 4254 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); 4255 int VOP3LiteralLimit = ST.hasVOP3Literal() ? 
1 : 0; 4256 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 4257 if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) 4258 return false; 4259 4260 SmallDenseSet<RegSubRegPair> SGPRsUsed; 4261 if (MO->isReg()) 4262 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); 4263 4264 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4265 if (i == OpIdx) 4266 continue; 4267 const MachineOperand &Op = MI.getOperand(i); 4268 if (Op.isReg()) { 4269 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); 4270 if (!SGPRsUsed.count(SGPR) && 4271 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 4272 if (--ConstantBusLimit <= 0) 4273 return false; 4274 SGPRsUsed.insert(SGPR); 4275 } 4276 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 4277 if (--ConstantBusLimit <= 0) 4278 return false; 4279 } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && 4280 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { 4281 if (!VOP3LiteralLimit--) 4282 return false; 4283 if (--ConstantBusLimit <= 0) 4284 return false; 4285 } 4286 } 4287 } 4288 4289 if (MO->isReg()) { 4290 assert(DefinedRC); 4291 return isLegalRegOperand(MRI, OpInfo, *MO); 4292 } 4293 4294 // Handle non-register types that are treated like immediates. 4295 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); 4296 4297 if (!DefinedRC) { 4298 // This operand expects an immediate. 4299 return true; 4300 } 4301 4302 return isImmOperandLegal(MI, OpIdx, *MO); 4303 } 4304 4305 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 4306 MachineInstr &MI) const { 4307 unsigned Opc = MI.getOpcode(); 4308 const MCInstrDesc &InstrDesc = get(Opc); 4309 4310 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 4311 MachineOperand &Src0 = MI.getOperand(Src0Idx); 4312 4313 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 4314 MachineOperand &Src1 = MI.getOperand(Src1Idx); 4315 4316 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 4317 // we need to only have one constant bus use before GFX10. 4318 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 4319 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && 4320 Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || 4321 isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) 4322 legalizeOpWithMove(MI, Src0Idx); 4323 4324 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 4325 // both the value to write (src0) and lane select (src1). Fix up non-SGPR 4326 // src0/src1 with V_READFIRSTLANE. 4327 if (Opc == AMDGPU::V_WRITELANE_B32) { 4328 const DebugLoc &DL = MI.getDebugLoc(); 4329 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 4330 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4331 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4332 .add(Src0); 4333 Src0.ChangeToRegister(Reg, false); 4334 } 4335 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 4336 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4337 const DebugLoc &DL = MI.getDebugLoc(); 4338 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4339 .add(Src1); 4340 Src1.ChangeToRegister(Reg, false); 4341 } 4342 return; 4343 } 4344 4345 // No VOP2 instructions support AGPRs. 
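// legalizeOpWithMove inserts a copy into an equivalent VGPR, so any AGPR input is rewritten into a class VOP2 can encode.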
4346 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) 4347 legalizeOpWithMove(MI, Src0Idx); 4348 4349 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) 4350 legalizeOpWithMove(MI, Src1Idx); 4351 4352 // VOP2 src0 instructions support all operand types, so we don't need to check 4353 // their legality. If src1 is already legal, we don't need to do anything. 4354 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 4355 return; 4356 4357 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 4358 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 4359 // select is uniform. 4360 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 4361 RI.isVGPR(MRI, Src1.getReg())) { 4362 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4363 const DebugLoc &DL = MI.getDebugLoc(); 4364 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4365 .add(Src1); 4366 Src1.ChangeToRegister(Reg, false); 4367 return; 4368 } 4369 4370 // We do not use commuteInstruction here because it is too aggressive and will 4371 // commute if it is possible. We only want to commute here if it improves 4372 // legality. This can be called a fairly large number of times so don't waste 4373 // compile time pointlessly swapping and checking legality again. 4374 if (HasImplicitSGPR || !MI.isCommutable()) { 4375 legalizeOpWithMove(MI, Src1Idx); 4376 return; 4377 } 4378 4379 // If src0 can be used as src1, commuting will make the operands legal. 4380 // Otherwise we have to give up and insert a move. 4381 // 4382 // TODO: Other immediate-like operand kinds could be commuted if there was a 4383 // MachineOperand::ChangeTo* for them. 4384 if ((!Src1.isImm() && !Src1.isReg()) || 4385 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 4386 legalizeOpWithMove(MI, Src1Idx); 4387 return; 4388 } 4389 4390 int CommutedOpc = commuteOpcode(MI); 4391 if (CommutedOpc == -1) { 4392 legalizeOpWithMove(MI, Src1Idx); 4393 return; 4394 } 4395 4396 MI.setDesc(get(CommutedOpc)); 4397 4398 Register Src0Reg = Src0.getReg(); 4399 unsigned Src0SubReg = Src0.getSubReg(); 4400 bool Src0Kill = Src0.isKill(); 4401 4402 if (Src1.isImm()) 4403 Src0.ChangeToImmediate(Src1.getImm()); 4404 else if (Src1.isReg()) { 4405 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 4406 Src0.setSubReg(Src1.getSubReg()); 4407 } else 4408 llvm_unreachable("Should only have register or immediate operands"); 4409 4410 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 4411 Src1.setSubReg(Src0SubReg); 4412 fixImplicitOperands(MI); 4413 } 4414 4415 // Legalize VOP3 operands. All operand types are supported for any operand 4416 // but only one literal constant and only starting from GFX10. 
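// V_PERMLANE16/X16 additionally require scalar src1 and src2, which are made uniform with v_readfirstlane when needed; SGPR or literal sources beyond the constant bus budget are moved into VGPRs.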
4417 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 4418 MachineInstr &MI) const { 4419 unsigned Opc = MI.getOpcode(); 4420 4421 int VOP3Idx[3] = { 4422 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 4423 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 4424 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 4425 }; 4426 4427 if (Opc == AMDGPU::V_PERMLANE16_B32 || 4428 Opc == AMDGPU::V_PERMLANEX16_B32) { 4429 // src1 and src2 must be scalar 4430 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); 4431 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); 4432 const DebugLoc &DL = MI.getDebugLoc(); 4433 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { 4434 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4435 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4436 .add(Src1); 4437 Src1.ChangeToRegister(Reg, false); 4438 } 4439 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { 4440 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4441 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4442 .add(Src2); 4443 Src2.ChangeToRegister(Reg, false); 4444 } 4445 } 4446 4447 // Find the one SGPR operand we are allowed to use. 4448 int ConstantBusLimit = ST.getConstantBusLimit(Opc); 4449 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; 4450 SmallDenseSet<unsigned> SGPRsUsed; 4451 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); 4452 if (SGPRReg != AMDGPU::NoRegister) { 4453 SGPRsUsed.insert(SGPRReg); 4454 --ConstantBusLimit; 4455 } 4456 4457 for (unsigned i = 0; i < 3; ++i) { 4458 int Idx = VOP3Idx[i]; 4459 if (Idx == -1) 4460 break; 4461 MachineOperand &MO = MI.getOperand(Idx); 4462 4463 if (!MO.isReg()) { 4464 if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) 4465 continue; 4466 4467 if (LiteralLimit > 0 && ConstantBusLimit > 0) { 4468 --LiteralLimit; 4469 --ConstantBusLimit; 4470 continue; 4471 } 4472 4473 --LiteralLimit; 4474 --ConstantBusLimit; 4475 legalizeOpWithMove(MI, Idx); 4476 continue; 4477 } 4478 4479 if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && 4480 !isOperandLegal(MI, Idx, &MO)) { 4481 legalizeOpWithMove(MI, Idx); 4482 continue; 4483 } 4484 4485 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 4486 continue; // VGPRs are legal 4487 4488 // We can use one SGPR in each VOP3 instruction prior to GFX10 4489 // and two starting from GFX10. 4490 if (SGPRsUsed.count(MO.getReg())) 4491 continue; 4492 if (ConstantBusLimit > 0) { 4493 SGPRsUsed.insert(MO.getReg()); 4494 --ConstantBusLimit; 4495 continue; 4496 } 4497 4498 // If we make it this far, then the operand is not legal and we must 4499 // legalize it. 
4500 legalizeOpWithMove(MI, Idx); 4501 } 4502 } 4503 4504 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 4505 MachineRegisterInfo &MRI) const { 4506 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 4507 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 4508 Register DstReg = MRI.createVirtualRegister(SRC); 4509 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 4510 4511 if (RI.hasAGPRs(VRC)) { 4512 VRC = RI.getEquivalentVGPRClass(VRC); 4513 Register NewSrcReg = MRI.createVirtualRegister(VRC); 4514 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4515 get(TargetOpcode::COPY), NewSrcReg) 4516 .addReg(SrcReg); 4517 SrcReg = NewSrcReg; 4518 } 4519 4520 if (SubRegs == 1) { 4521 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4522 get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 4523 .addReg(SrcReg); 4524 return DstReg; 4525 } 4526 4527 SmallVector<unsigned, 8> SRegs; 4528 for (unsigned i = 0; i < SubRegs; ++i) { 4529 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4530 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4531 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 4532 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 4533 SRegs.push_back(SGPR); 4534 } 4535 4536 MachineInstrBuilder MIB = 4537 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 4538 get(AMDGPU::REG_SEQUENCE), DstReg); 4539 for (unsigned i = 0; i < SubRegs; ++i) { 4540 MIB.addReg(SRegs[i]); 4541 MIB.addImm(RI.getSubRegFromChannel(i)); 4542 } 4543 return DstReg; 4544 } 4545 4546 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 4547 MachineInstr &MI) const { 4548 4549 // If the pointer is stored in VGPRs, then we need to move it to 4550 // SGPRs using v_readfirstlane. This is safe because we only select 4551 // loads with uniform pointers to SMRD instructions, so we know the 4552 // pointer value is uniform. 4553 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 4554 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 4555 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 4556 SBase->setReg(SGPR); 4557 } 4558 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); 4559 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 4560 unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 4561 SOff->setReg(SGPR); 4562 } 4563 } 4564 4565 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 4566 MachineBasicBlock::iterator I, 4567 const TargetRegisterClass *DstRC, 4568 MachineOperand &Op, 4569 MachineRegisterInfo &MRI, 4570 const DebugLoc &DL) const { 4571 Register OpReg = Op.getReg(); 4572 unsigned OpSubReg = Op.getSubReg(); 4573 4574 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 4575 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 4576 4577 // Check if operand is already the correct register class. 4578 if (DstRC == OpRC) 4579 return; 4580 4581 Register DstReg = MRI.createVirtualRegister(DstRC); 4582 MachineInstr *Copy = 4583 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 4584 4585 Op.setReg(DstReg); 4586 Op.setSubReg(0); 4587 4588 MachineInstr *Def = MRI.getVRegDef(OpReg); 4589 if (!Def) 4590 return; 4591 4592 // Try to eliminate the copy if it is copying an immediate value.
4593 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) 4594 FoldImmediate(*Copy, *Def, OpReg, &MRI); 4595 4596 bool ImpDef = Def->isImplicitDef(); 4597 while (!ImpDef && Def && Def->isCopy()) { 4598 if (Def->getOperand(1).getReg().isPhysical()) 4599 break; 4600 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); 4601 ImpDef = Def && Def->isImplicitDef(); 4602 } 4603 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && 4604 !ImpDef) 4605 Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 4606 } 4607 4608 // Emit the actual waterfall loop, executing the wrapped instruction for each 4609 // unique value of \p Rsrc across all lanes. In the best case we execute 1 4610 // iteration, in the worst case we execute 64 (once per lane). 4611 static void 4612 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, 4613 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 4614 const DebugLoc &DL, MachineOperand &Rsrc) { 4615 MachineFunction &MF = *OrigBB.getParent(); 4616 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4617 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4618 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4619 unsigned SaveExecOpc = 4620 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 4621 unsigned XorTermOpc = 4622 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 4623 unsigned AndOpc = 4624 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 4625 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 4626 4627 MachineBasicBlock::iterator I = LoopBB.begin(); 4628 4629 Register VRsrc = Rsrc.getReg(); 4630 unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); 4631 4632 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 4633 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); 4634 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); 4635 Register AndCond = MRI.createVirtualRegister(BoolXExecRC); 4636 Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4637 Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4638 Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4639 Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4640 Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 4641 4642 // Beginning of the loop, read the next Rsrc variant. 4643 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) 4644 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0); 4645 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1) 4646 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1); 4647 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2) 4648 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2); 4649 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3) 4650 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3); 4651 4652 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc) 4653 .addReg(SRsrcSub0) 4654 .addImm(AMDGPU::sub0) 4655 .addReg(SRsrcSub1) 4656 .addImm(AMDGPU::sub1) 4657 .addReg(SRsrcSub2) 4658 .addImm(AMDGPU::sub2) 4659 .addReg(SRsrcSub3) 4660 .addImm(AMDGPU::sub3); 4661 4662 // Update Rsrc operand to use the SGPR Rsrc. 4663 Rsrc.setReg(SRsrc); 4664 Rsrc.setIsKill(true); 4665 4666 // Identify all lanes with identical Rsrc operands in their VGPRs. 
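// The 128-bit descriptor is compared 64 bits at a time and the two results are ANDed, so only lanes whose whole Rsrc matches the readfirstlane'd value stay active for this iteration.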
4667 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0) 4668 .addReg(SRsrc, 0, AMDGPU::sub0_sub1) 4669 .addReg(VRsrc, 0, AMDGPU::sub0_sub1); 4670 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1) 4671 .addReg(SRsrc, 0, AMDGPU::sub2_sub3) 4672 .addReg(VRsrc, 0, AMDGPU::sub2_sub3); 4673 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond) 4674 .addReg(CondReg0) 4675 .addReg(CondReg1); 4676 4677 MRI.setSimpleHint(SaveExec, AndCond); 4678 4679 // Update EXEC to matching lanes, saving original to SaveExec. 4680 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) 4681 .addReg(AndCond, RegState::Kill); 4682 4683 // The original instruction is here; we insert the terminators after it. 4684 I = LoopBB.end(); 4685 4686 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 4687 BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) 4688 .addReg(Exec) 4689 .addReg(SaveExec); 4690 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); 4691 } 4692 4693 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register 4694 // with SGPRs by iterating over all unique values across all lanes. 4695 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, 4696 MachineOperand &Rsrc, MachineDominatorTree *MDT) { 4697 MachineBasicBlock &MBB = *MI.getParent(); 4698 MachineFunction &MF = *MBB.getParent(); 4699 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4700 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4701 MachineRegisterInfo &MRI = MF.getRegInfo(); 4702 MachineBasicBlock::iterator I(&MI); 4703 const DebugLoc &DL = MI.getDebugLoc(); 4704 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4705 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4706 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 4707 4708 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 4709 4710 // Save the EXEC mask 4711 BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 4712 4713 // Killed uses in the instruction we are waterfalling around will be 4714 // incorrect due to the added control-flow. 4715 for (auto &MO : MI.uses()) { 4716 if (MO.isReg() && MO.isUse()) { 4717 MRI.clearKillFlags(MO.getReg()); 4718 } 4719 } 4720 4721 // To insert the loop we need to split the block. Move everything after this 4722 // point to a new block, and insert a new empty block between the two. 4723 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 4724 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 4725 MachineFunction::iterator MBBI(MBB); 4726 ++MBBI; 4727 4728 MF.insert(MBBI, LoopBB); 4729 MF.insert(MBBI, RemainderBB); 4730 4731 LoopBB->addSuccessor(LoopBB); 4732 LoopBB->addSuccessor(RemainderBB); 4733 4734 // Move MI to the LoopBB, and the remainder of the block to RemainderBB. 4735 MachineBasicBlock::iterator J = I++; 4736 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 4737 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 4738 LoopBB->splice(LoopBB->begin(), &MBB, J); 4739 4740 MBB.addSuccessor(LoopBB); 4741 4742 // Update dominators. We know that MBB immediately dominates LoopBB, that 4743 // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately 4744 // dominates all of the successors transferred to it from MBB that MBB used 4745 // to properly dominate. 
4746 if (MDT) { 4747 MDT->addNewBlock(LoopBB, &MBB); 4748 MDT->addNewBlock(RemainderBB, LoopBB); 4749 for (auto &Succ : RemainderBB->successors()) { 4750 if (MDT->properlyDominates(&MBB, Succ)) { 4751 MDT->changeImmediateDominator(Succ, RemainderBB); 4752 } 4753 } 4754 } 4755 4756 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); 4757 4758 // Restore the EXEC mask 4759 MachineBasicBlock::iterator First = RemainderBB->begin(); 4760 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 4761 } 4762 4763 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 4764 static std::tuple<unsigned, unsigned> 4765 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 4766 MachineBasicBlock &MBB = *MI.getParent(); 4767 MachineFunction &MF = *MBB.getParent(); 4768 MachineRegisterInfo &MRI = MF.getRegInfo(); 4769 4770 // Extract the ptr from the resource descriptor. 4771 unsigned RsrcPtr = 4772 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 4773 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 4774 4775 // Create an empty resource descriptor 4776 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4777 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4778 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4779 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 4780 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 4781 4782 // Zero64 = 0 4783 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 4784 .addImm(0); 4785 4786 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 4787 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 4788 .addImm(RsrcDataFormat & 0xFFFFFFFF); 4789 4790 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 4791 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 4792 .addImm(RsrcDataFormat >> 32); 4793 4794 // NewSRsrc = {Zero64, SRsrcFormat} 4795 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) 4796 .addReg(Zero64) 4797 .addImm(AMDGPU::sub0_sub1) 4798 .addReg(SRsrcFormatLo) 4799 .addImm(AMDGPU::sub2) 4800 .addReg(SRsrcFormatHi) 4801 .addImm(AMDGPU::sub3); 4802 4803 return std::make_tuple(RsrcPtr, NewSRsrc); 4804 } 4805 4806 void SIInstrInfo::legalizeOperands(MachineInstr &MI, 4807 MachineDominatorTree *MDT) const { 4808 MachineFunction &MF = *MI.getParent()->getParent(); 4809 MachineRegisterInfo &MRI = MF.getRegInfo(); 4810 4811 // Legalize VOP2 4812 if (isVOP2(MI) || isVOPC(MI)) { 4813 legalizeOperandsVOP2(MRI, MI); 4814 return; 4815 } 4816 4817 // Legalize VOP3 4818 if (isVOP3(MI)) { 4819 legalizeOperandsVOP3(MRI, MI); 4820 return; 4821 } 4822 4823 // Legalize SMRD 4824 if (isSMRD(MI)) { 4825 legalizeOperandsSMRD(MRI, MI); 4826 return; 4827 } 4828 4829 // Legalize REG_SEQUENCE and PHI 4830 // The register class of the operands must match the register 4831 // class of the output.
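// For PHIs, the first loop below only records whether any input is already in a vector (VGPR/AGPR) class; the common class chosen from that decides how every input is then legalized.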
4832 if (MI.getOpcode() == AMDGPU::PHI) { 4833 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; 4834 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { 4835 if (!MI.getOperand(i).isReg() || 4836 !Register::isVirtualRegister(MI.getOperand(i).getReg())) 4837 continue; 4838 const TargetRegisterClass *OpRC = 4839 MRI.getRegClass(MI.getOperand(i).getReg()); 4840 if (RI.hasVectorRegisters(OpRC)) { 4841 VRC = OpRC; 4842 } else { 4843 SRC = OpRC; 4844 } 4845 } 4846 4847 // If any of the operands are VGPR registers, then they all must be 4848 // VGPRs; otherwise we will create illegal VGPR->SGPR copies when legalizing 4849 // them. 4850 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { 4851 if (!VRC) { 4852 assert(SRC); 4853 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { 4854 VRC = &AMDGPU::VReg_1RegClass; 4855 } else 4856 VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) 4857 ? RI.getEquivalentAGPRClass(SRC) 4858 : RI.getEquivalentVGPRClass(SRC); 4859 } else { 4860 VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) 4861 ? RI.getEquivalentAGPRClass(VRC) 4862 : RI.getEquivalentVGPRClass(VRC); 4863 } 4864 RC = VRC; 4865 } else { 4866 RC = SRC; 4867 } 4868 4869 // Update all the operands so they have the same type. 4870 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 4871 MachineOperand &Op = MI.getOperand(I); 4872 if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) 4873 continue; 4874 4875 // MI is a PHI instruction. 4876 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 4877 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 4878 4879 // Avoid creating no-op copies with the same src and dst reg class. These 4880 // confuse some of the machine passes. 4881 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 4882 } 4883 } 4884 4885 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 4886 // VGPR dest type and SGPR sources, insert copies so all operands are 4887 // VGPRs. This seems to help operand folding / the register coalescer. 4888 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 4889 MachineBasicBlock *MBB = MI.getParent(); 4890 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 4891 if (RI.hasVGPRs(DstRC)) { 4892 // Update all the operands so they are VGPR register classes. These may 4893 // not be the same register class because REG_SEQUENCE supports mixing 4894 // subregister index types e.g. 
sub0_sub1 + sub2 + sub3 4895 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 4896 MachineOperand &Op = MI.getOperand(I); 4897 if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) 4898 continue; 4899 4900 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 4901 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 4902 if (VRC == OpRC) 4903 continue; 4904 4905 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 4906 Op.setIsKill(); 4907 } 4908 } 4909 4910 return; 4911 } 4912 4913 // Legalize INSERT_SUBREG 4914 // src0 must have the same register class as dst 4915 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 4916 Register Dst = MI.getOperand(0).getReg(); 4917 Register Src0 = MI.getOperand(1).getReg(); 4918 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 4919 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 4920 if (DstRC != Src0RC) { 4921 MachineBasicBlock *MBB = MI.getParent(); 4922 MachineOperand &Op = MI.getOperand(1); 4923 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 4924 } 4925 return; 4926 } 4927 4928 // Legalize SI_INIT_M0 4929 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { 4930 MachineOperand &Src = MI.getOperand(0); 4931 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 4932 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 4933 return; 4934 } 4935 4936 // Legalize MIMG and MUBUF/MTBUF for shaders. 4937 // 4938 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 4939 // scratch memory access. In both cases, the legalization never involves 4940 // conversion to the addr64 form. 4941 if (isMIMG(MI) || 4942 (AMDGPU::isShader(MF.getFunction().getCallingConv()) && 4943 (isMUBUF(MI) || isMTBUF(MI)))) { 4944 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 4945 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { 4946 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); 4947 SRsrc->setReg(SGPR); 4948 } 4949 4950 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 4951 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { 4952 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); 4953 SSamp->setReg(SGPR); 4954 } 4955 return; 4956 } 4957 4958 // Legalize MUBUF* instructions. 4959 int RsrcIdx = 4960 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 4961 if (RsrcIdx != -1) { 4962 // We have a MUBUF instruction 4963 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 4964 unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; 4965 if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), 4966 RI.getRegClass(RsrcRC))) { 4967 // The operands are legal. 4968 // FIXME: We may need to legalize operands besides srsrc. 4969 return; 4970 } 4971 4972 // Legalize a VGPR Rsrc. 4973 // 4974 // If the instruction is _ADDR64, we can avoid a waterfall by extracting 4975 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using 4976 // a zero-value SRsrc. 4977 // 4978 // If the instruction is _OFFSET (both idxen and offen disabled), and we 4979 // support ADDR64 instructions, we can convert to ADDR64 and do the same as 4980 // above. 4981 // 4982 // Otherwise we are on non-ADDR64 hardware, and/or we have 4983 // idxen/offen/bothen and we fall back to a waterfall loop.
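// In the ADDR64 paths below, the base pointer extracted by extractRsrcPtr() is either added to the existing VAddr with a V_ADD_I32/V_ADDC_U32 pair or becomes the new VAddr outright, and the original descriptor is replaced with the zeroed one.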

    MachineBasicBlock &MBB = *MI.getParent();

    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
    if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

      const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
      Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
      Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
          .addDef(CondReg0)
          .addReg(RsrcPtr, 0, AMDGPU::sub0)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
          .addImm(0);

      // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
          .addDef(CondReg1, RegState::Dead)
          .addReg(RsrcPtr, 0, AMDGPU::sub1)
          .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
          .addReg(CondReg0, RegState::Kill)
          .addImm(0);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
          .addReg(NewVAddrLo)
          .addImm(AMDGPU::sub0)
          .addReg(NewVAddrHi)
          .addImm(AMDGPU::sub1);

      VAddr->setReg(NewVAddr);
      Rsrc->setReg(NewSRsrc);
    } else if (!VAddr && ST.hasAddr64()) {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
                 < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB =
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                .add(*VData)
                .addReg(NewVAddr)
                .addReg(NewSRsrc)
                .add(*SOffset)
                .add(*Offset);

        // Atomics do not have this operand.
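        // Roughly, the rebuilt instruction ends up as (operand names only,
        // not actual assembly syntax):
        //   <op>_ADDR64 vdata, newvaddr, newsrsrc, soffset, offset,
        //               [glc][dlc], slc, [tfe], swz
        // where the bracketed immediates are only appended if the original
        // opcode has them.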
5059 if (const MachineOperand *GLC = 5060 getNamedOperand(MI, AMDGPU::OpName::glc)) { 5061 MIB.addImm(GLC->getImm()); 5062 } 5063 if (const MachineOperand *DLC = 5064 getNamedOperand(MI, AMDGPU::OpName::dlc)) { 5065 MIB.addImm(DLC->getImm()); 5066 } 5067 5068 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); 5069 5070 if (const MachineOperand *TFE = 5071 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 5072 MIB.addImm(TFE->getImm()); 5073 } 5074 5075 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); 5076 5077 MIB.cloneMemRefs(MI); 5078 Addr64 = MIB; 5079 } else { 5080 // Atomics with return. 5081 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 5082 .add(*VData) 5083 .add(*VDataIn) 5084 .addReg(NewVAddr) 5085 .addReg(NewSRsrc) 5086 .add(*SOffset) 5087 .add(*Offset) 5088 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 5089 .cloneMemRefs(MI); 5090 } 5091 5092 MI.removeFromParent(); 5093 5094 // NewVaddr = {NewVaddrHi, NewVaddrLo} 5095 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 5096 NewVAddr) 5097 .addReg(RsrcPtr, 0, AMDGPU::sub0) 5098 .addImm(AMDGPU::sub0) 5099 .addReg(RsrcPtr, 0, AMDGPU::sub1) 5100 .addImm(AMDGPU::sub1); 5101 } else { 5102 // This is another variant; legalize Rsrc with waterfall loop from VGPRs 5103 // to SGPRs. 5104 loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); 5105 } 5106 } 5107 } 5108 5109 void SIInstrInfo::moveToVALU(MachineInstr &TopInst, 5110 MachineDominatorTree *MDT) const { 5111 SetVectorType Worklist; 5112 Worklist.insert(&TopInst); 5113 5114 while (!Worklist.empty()) { 5115 MachineInstr &Inst = *Worklist.pop_back_val(); 5116 MachineBasicBlock *MBB = Inst.getParent(); 5117 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 5118 5119 unsigned Opcode = Inst.getOpcode(); 5120 unsigned NewOpcode = getVALUOp(Inst); 5121 5122 // Handle some special cases 5123 switch (Opcode) { 5124 default: 5125 break; 5126 case AMDGPU::S_ADD_U64_PSEUDO: 5127 case AMDGPU::S_SUB_U64_PSEUDO: 5128 splitScalar64BitAddSub(Worklist, Inst, MDT); 5129 Inst.eraseFromParent(); 5130 continue; 5131 case AMDGPU::S_ADD_I32: 5132 case AMDGPU::S_SUB_I32: 5133 // FIXME: The u32 versions currently selected use the carry. 
5134 if (moveScalarAddSub(Worklist, Inst, MDT)) 5135 continue; 5136 5137 // Default handling 5138 break; 5139 case AMDGPU::S_AND_B64: 5140 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 5141 Inst.eraseFromParent(); 5142 continue; 5143 5144 case AMDGPU::S_OR_B64: 5145 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 5146 Inst.eraseFromParent(); 5147 continue; 5148 5149 case AMDGPU::S_XOR_B64: 5150 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 5151 Inst.eraseFromParent(); 5152 continue; 5153 5154 case AMDGPU::S_NAND_B64: 5155 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 5156 Inst.eraseFromParent(); 5157 continue; 5158 5159 case AMDGPU::S_NOR_B64: 5160 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 5161 Inst.eraseFromParent(); 5162 continue; 5163 5164 case AMDGPU::S_XNOR_B64: 5165 if (ST.hasDLInsts()) 5166 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 5167 else 5168 splitScalar64BitXnor(Worklist, Inst, MDT); 5169 Inst.eraseFromParent(); 5170 continue; 5171 5172 case AMDGPU::S_ANDN2_B64: 5173 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 5174 Inst.eraseFromParent(); 5175 continue; 5176 5177 case AMDGPU::S_ORN2_B64: 5178 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 5179 Inst.eraseFromParent(); 5180 continue; 5181 5182 case AMDGPU::S_NOT_B64: 5183 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 5184 Inst.eraseFromParent(); 5185 continue; 5186 5187 case AMDGPU::S_BCNT1_I32_B64: 5188 splitScalar64BitBCNT(Worklist, Inst); 5189 Inst.eraseFromParent(); 5190 continue; 5191 5192 case AMDGPU::S_BFE_I64: 5193 splitScalar64BitBFE(Worklist, Inst); 5194 Inst.eraseFromParent(); 5195 continue; 5196 5197 case AMDGPU::S_LSHL_B32: 5198 if (ST.hasOnlyRevVALUShifts()) { 5199 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 5200 swapOperands(Inst); 5201 } 5202 break; 5203 case AMDGPU::S_ASHR_I32: 5204 if (ST.hasOnlyRevVALUShifts()) { 5205 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 5206 swapOperands(Inst); 5207 } 5208 break; 5209 case AMDGPU::S_LSHR_B32: 5210 if (ST.hasOnlyRevVALUShifts()) { 5211 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 5212 swapOperands(Inst); 5213 } 5214 break; 5215 case AMDGPU::S_LSHL_B64: 5216 if (ST.hasOnlyRevVALUShifts()) { 5217 NewOpcode = AMDGPU::V_LSHLREV_B64; 5218 swapOperands(Inst); 5219 } 5220 break; 5221 case AMDGPU::S_ASHR_I64: 5222 if (ST.hasOnlyRevVALUShifts()) { 5223 NewOpcode = AMDGPU::V_ASHRREV_I64; 5224 swapOperands(Inst); 5225 } 5226 break; 5227 case AMDGPU::S_LSHR_B64: 5228 if (ST.hasOnlyRevVALUShifts()) { 5229 NewOpcode = AMDGPU::V_LSHRREV_B64; 5230 swapOperands(Inst); 5231 } 5232 break; 5233 5234 case AMDGPU::S_ABS_I32: 5235 lowerScalarAbs(Worklist, Inst); 5236 Inst.eraseFromParent(); 5237 continue; 5238 5239 case AMDGPU::S_CBRANCH_SCC0: 5240 case AMDGPU::S_CBRANCH_SCC1: 5241 // Clear unused bits of vcc 5242 if (ST.isWave32()) 5243 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), 5244 AMDGPU::VCC_LO) 5245 .addReg(AMDGPU::EXEC_LO) 5246 .addReg(AMDGPU::VCC_LO); 5247 else 5248 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 5249 AMDGPU::VCC) 5250 .addReg(AMDGPU::EXEC) 5251 .addReg(AMDGPU::VCC); 5252 break; 5253 5254 case AMDGPU::S_BFE_U64: 5255 case AMDGPU::S_BFM_B64: 5256 llvm_unreachable("Moving this op to VALU not implemented"); 5257 5258 case AMDGPU::S_PACK_LL_B32_B16: 5259 case AMDGPU::S_PACK_LH_B32_B16: 5260 case AMDGPU::S_PACK_HH_B32_B16: 5261 movePackToVALU(Worklist, MRI, Inst); 5262 
Inst.eraseFromParent(); 5263 continue; 5264 5265 case AMDGPU::S_XNOR_B32: 5266 lowerScalarXnor(Worklist, Inst); 5267 Inst.eraseFromParent(); 5268 continue; 5269 5270 case AMDGPU::S_NAND_B32: 5271 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 5272 Inst.eraseFromParent(); 5273 continue; 5274 5275 case AMDGPU::S_NOR_B32: 5276 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 5277 Inst.eraseFromParent(); 5278 continue; 5279 5280 case AMDGPU::S_ANDN2_B32: 5281 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 5282 Inst.eraseFromParent(); 5283 continue; 5284 5285 case AMDGPU::S_ORN2_B32: 5286 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 5287 Inst.eraseFromParent(); 5288 continue; 5289 5290 // TODO: remove as soon as everything is ready 5291 // to replace VGPR to SGPR copy with V_READFIRSTLANEs. 5292 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO 5293 // can only be selected from the uniform SDNode. 5294 case AMDGPU::S_ADD_CO_PSEUDO: 5295 case AMDGPU::S_SUB_CO_PSEUDO: { 5296 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 5297 ? AMDGPU::V_ADDC_U32_e64 5298 : AMDGPU::V_SUBB_U32_e64; 5299 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5300 5301 Register CarryInReg = Inst.getOperand(4).getReg(); 5302 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { 5303 Register NewCarryReg = MRI.createVirtualRegister(CarryRC); 5304 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) 5305 .addReg(CarryInReg); 5306 } 5307 5308 Register CarryOutReg = Inst.getOperand(1).getReg(); 5309 5310 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( 5311 MRI.getRegClass(Inst.getOperand(0).getReg()))); 5312 MachineInstr *CarryOp = 5313 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) 5314 .addReg(CarryOutReg, RegState::Define) 5315 .add(Inst.getOperand(2)) 5316 .add(Inst.getOperand(3)) 5317 .addReg(CarryInReg) 5318 .addImm(0); 5319 legalizeOperands(*CarryOp); 5320 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); 5321 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); 5322 Inst.eraseFromParent(); 5323 } 5324 continue; 5325 case AMDGPU::S_UADDO_PSEUDO: 5326 case AMDGPU::S_USUBO_PSEUDO: { 5327 const DebugLoc &DL = Inst.getDebugLoc(); 5328 MachineOperand &Dest0 = Inst.getOperand(0); 5329 MachineOperand &Dest1 = Inst.getOperand(1); 5330 MachineOperand &Src0 = Inst.getOperand(2); 5331 MachineOperand &Src1 = Inst.getOperand(3); 5332 5333 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 5334 ? AMDGPU::V_ADD_I32_e64 5335 : AMDGPU::V_SUB_I32_e64; 5336 const TargetRegisterClass *NewRC = 5337 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); 5338 Register DestReg = MRI.createVirtualRegister(NewRC); 5339 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) 5340 .addReg(Dest1.getReg(), RegState::Define) 5341 .add(Src0) 5342 .add(Src1) 5343 .addImm(0); // clamp bit 5344 5345 legalizeOperands(*NewInstr, MDT); 5346 5347 MRI.replaceRegWith(Dest0.getReg(), DestReg); 5348 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, 5349 Worklist); 5350 Inst.eraseFromParent(); 5351 } 5352 continue; 5353 } 5354 5355 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 5356 // We cannot move this instruction to the VALU, so we should try to 5357 // legalize its operands instead. 5358 legalizeOperands(Inst, MDT); 5359 continue; 5360 } 5361 5362 // Use the new VALU Opcode. 
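    // From here on the instruction is rewritten in place: install the VALU
    // descriptor, drop any SCC references, patch up operands that differ
    // between the SALU and VALU encodings, and finally switch the result to
    // an equivalent VGPR class before legalizing the operands.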
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst.setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst.getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        // Only propagate through live-def of SCC.
        if (Op.isDef() && !Op.isDead())
          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
        Inst.RemoveOperand(i);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst.addOperand(MachineOperand::CreateImm(0));
      Inst.addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst.addOperand(MachineOperand::CreateImm(0));
    }

    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    fixImplicitOperands(Inst);

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst.RemoveOperand(2);                      // Remove old immediate.
      Inst.addOperand(MachineOperand::CreateImm(Offset));
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    }

    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      Register DstReg = Inst.getOperand(0).getReg();
      if (Register::isPhysicalRegister(DstReg))
        continue;

      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
      if (!NewDstRC)
        continue;

      if (Inst.isCopy() &&
          Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge, since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
        MRI.clearKillFlags(Inst.getOperand(1).getReg());
        Inst.getOperand(0).setReg(DstReg);

        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
        // these are deleted later, but at -O0 it would leave a suspicious
        // looking illegal copy of an undef register.
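        // E.g. a now-unused "%1:sgpr_32 = COPY %2:vgpr_32" becomes
        // "%1:sgpr_32 = IMPLICIT_DEF" below (sketch only; the %N names are
        // placeholders).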
5439 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 5440 Inst.RemoveOperand(I); 5441 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 5442 continue; 5443 } 5444 5445 NewDstReg = MRI.createVirtualRegister(NewDstRC); 5446 MRI.replaceRegWith(DstReg, NewDstReg); 5447 } 5448 5449 // Legalize the operands 5450 legalizeOperands(Inst, MDT); 5451 5452 if (HasDst) 5453 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 5454 } 5455 } 5456 5457 // Add/sub require special handling to deal with carry outs. 5458 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, 5459 MachineDominatorTree *MDT) const { 5460 if (ST.hasAddNoCarry()) { 5461 // Assume there is no user of scc since we don't select this in that case. 5462 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 5463 // is used. 5464 5465 MachineBasicBlock &MBB = *Inst.getParent(); 5466 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5467 5468 Register OldDstReg = Inst.getOperand(0).getReg(); 5469 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5470 5471 unsigned Opc = Inst.getOpcode(); 5472 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 5473 5474 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 5475 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 5476 5477 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 5478 Inst.RemoveOperand(3); 5479 5480 Inst.setDesc(get(NewOpc)); 5481 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit 5482 Inst.addImplicitDefUseOperands(*MBB.getParent()); 5483 MRI.replaceRegWith(OldDstReg, ResultReg); 5484 legalizeOperands(Inst, MDT); 5485 5486 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5487 return true; 5488 } 5489 5490 return false; 5491 } 5492 5493 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, 5494 MachineInstr &Inst) const { 5495 MachineBasicBlock &MBB = *Inst.getParent(); 5496 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5497 MachineBasicBlock::iterator MII = Inst; 5498 DebugLoc DL = Inst.getDebugLoc(); 5499 5500 MachineOperand &Dest = Inst.getOperand(0); 5501 MachineOperand &Src = Inst.getOperand(1); 5502 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5503 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5504 5505 unsigned SubOp = ST.hasAddNoCarry() ? 
    AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;

  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
      .addImm(0)
      .addReg(Src.getReg());

  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
      .addReg(Src.getReg())
      .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
        .add(Src0)
        .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
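    //
    // Shape of the expansion (placeholder vregs, one of three forms):
    //   src0 in SGPR:  %t = S_NOT_B32 %src0;  %d = S_XOR_B32 %t, %src1
    //   src1 in SGPR:  %t = S_NOT_B32 %src1;  %d = S_XOR_B32 %src0, %t
    //   neither:       %t = S_XOR_B32 %src0, %src1;  %d = S_NOT_B32 %t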
5558 if (Src0IsSGPR) { 5559 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); 5560 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 5561 .addReg(Temp) 5562 .add(Src1); 5563 } else if (Src1IsSGPR) { 5564 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); 5565 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 5566 .add(Src0) 5567 .addReg(Temp); 5568 } else { 5569 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) 5570 .add(Src0) 5571 .add(Src1); 5572 MachineInstr *Not = 5573 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); 5574 Worklist.insert(Not); 5575 } 5576 5577 MRI.replaceRegWith(Dest.getReg(), NewDest); 5578 5579 Worklist.insert(Xor); 5580 5581 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 5582 } 5583 } 5584 5585 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, 5586 MachineInstr &Inst, 5587 unsigned Opcode) const { 5588 MachineBasicBlock &MBB = *Inst.getParent(); 5589 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5590 MachineBasicBlock::iterator MII = Inst; 5591 const DebugLoc &DL = Inst.getDebugLoc(); 5592 5593 MachineOperand &Dest = Inst.getOperand(0); 5594 MachineOperand &Src0 = Inst.getOperand(1); 5595 MachineOperand &Src1 = Inst.getOperand(2); 5596 5597 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5598 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5599 5600 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) 5601 .add(Src0) 5602 .add(Src1); 5603 5604 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 5605 .addReg(Interm); 5606 5607 Worklist.insert(&Op); 5608 Worklist.insert(&Not); 5609 5610 MRI.replaceRegWith(Dest.getReg(), NewDest); 5611 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 5612 } 5613 5614 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, 5615 MachineInstr &Inst, 5616 unsigned Opcode) const { 5617 MachineBasicBlock &MBB = *Inst.getParent(); 5618 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5619 MachineBasicBlock::iterator MII = Inst; 5620 const DebugLoc &DL = Inst.getDebugLoc(); 5621 5622 MachineOperand &Dest = Inst.getOperand(0); 5623 MachineOperand &Src0 = Inst.getOperand(1); 5624 MachineOperand &Src1 = Inst.getOperand(2); 5625 5626 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5627 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5628 5629 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) 5630 .add(Src1); 5631 5632 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) 5633 .add(Src0) 5634 .addReg(Interm); 5635 5636 Worklist.insert(&Not); 5637 Worklist.insert(&Op); 5638 5639 MRI.replaceRegWith(Dest.getReg(), NewDest); 5640 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 5641 } 5642 5643 void SIInstrInfo::splitScalar64BitUnaryOp( 5644 SetVectorType &Worklist, MachineInstr &Inst, 5645 unsigned Opcode) const { 5646 MachineBasicBlock &MBB = *Inst.getParent(); 5647 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5648 5649 MachineOperand &Dest = Inst.getOperand(0); 5650 MachineOperand &Src0 = Inst.getOperand(1); 5651 DebugLoc DL = Inst.getDebugLoc(); 5652 5653 MachineBasicBlock::iterator MII = Inst; 5654 5655 const MCInstrDesc &InstDesc = get(Opcode); 5656 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
5657 MRI.getRegClass(Src0.getReg()) : 5658 &AMDGPU::SGPR_32RegClass; 5659 5660 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 5661 5662 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5663 AMDGPU::sub0, Src0SubRC); 5664 5665 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 5666 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 5667 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 5668 5669 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 5670 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 5671 5672 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5673 AMDGPU::sub1, Src0SubRC); 5674 5675 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 5676 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 5677 5678 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 5679 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 5680 .addReg(DestSub0) 5681 .addImm(AMDGPU::sub0) 5682 .addReg(DestSub1) 5683 .addImm(AMDGPU::sub1); 5684 5685 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 5686 5687 Worklist.insert(&LoHalf); 5688 Worklist.insert(&HiHalf); 5689 5690 // We don't need to legalizeOperands here because for a single operand, src0 5691 // will support any kind of input. 5692 5693 // Move all users of this moved value. 5694 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 5695 } 5696 5697 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, 5698 MachineInstr &Inst, 5699 MachineDominatorTree *MDT) const { 5700 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 5701 5702 MachineBasicBlock &MBB = *Inst.getParent(); 5703 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5704 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5705 5706 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5707 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5708 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5709 5710 Register CarryReg = MRI.createVirtualRegister(CarryRC); 5711 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 5712 5713 MachineOperand &Dest = Inst.getOperand(0); 5714 MachineOperand &Src0 = Inst.getOperand(1); 5715 MachineOperand &Src1 = Inst.getOperand(2); 5716 const DebugLoc &DL = Inst.getDebugLoc(); 5717 MachineBasicBlock::iterator MII = Inst; 5718 5719 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); 5720 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); 5721 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 5722 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 5723 5724 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5725 AMDGPU::sub0, Src0SubRC); 5726 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 5727 AMDGPU::sub0, Src1SubRC); 5728 5729 5730 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5731 AMDGPU::sub1, Src0SubRC); 5732 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 5733 AMDGPU::sub1, Src1SubRC); 5734 5735 unsigned LoOpc = IsAdd ? 
AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; 5736 MachineInstr *LoHalf = 5737 BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) 5738 .addReg(CarryReg, RegState::Define) 5739 .add(SrcReg0Sub0) 5740 .add(SrcReg1Sub0) 5741 .addImm(0); // clamp bit 5742 5743 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 5744 MachineInstr *HiHalf = 5745 BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) 5746 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 5747 .add(SrcReg0Sub1) 5748 .add(SrcReg1Sub1) 5749 .addReg(CarryReg, RegState::Kill) 5750 .addImm(0); // clamp bit 5751 5752 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 5753 .addReg(DestSub0) 5754 .addImm(AMDGPU::sub0) 5755 .addReg(DestSub1) 5756 .addImm(AMDGPU::sub1); 5757 5758 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 5759 5760 // Try to legalize the operands in case we need to swap the order to keep it 5761 // valid. 5762 legalizeOperands(*LoHalf, MDT); 5763 legalizeOperands(*HiHalf, MDT); 5764 5765 // Move all users of this moved vlaue. 5766 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 5767 } 5768 5769 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, 5770 MachineInstr &Inst, unsigned Opcode, 5771 MachineDominatorTree *MDT) const { 5772 MachineBasicBlock &MBB = *Inst.getParent(); 5773 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5774 5775 MachineOperand &Dest = Inst.getOperand(0); 5776 MachineOperand &Src0 = Inst.getOperand(1); 5777 MachineOperand &Src1 = Inst.getOperand(2); 5778 DebugLoc DL = Inst.getDebugLoc(); 5779 5780 MachineBasicBlock::iterator MII = Inst; 5781 5782 const MCInstrDesc &InstDesc = get(Opcode); 5783 const TargetRegisterClass *Src0RC = Src0.isReg() ? 5784 MRI.getRegClass(Src0.getReg()) : 5785 &AMDGPU::SGPR_32RegClass; 5786 5787 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 5788 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
5789 MRI.getRegClass(Src1.getReg()) : 5790 &AMDGPU::SGPR_32RegClass; 5791 5792 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 5793 5794 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5795 AMDGPU::sub0, Src0SubRC); 5796 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 5797 AMDGPU::sub0, Src1SubRC); 5798 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 5799 AMDGPU::sub1, Src0SubRC); 5800 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 5801 AMDGPU::sub1, Src1SubRC); 5802 5803 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 5804 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 5805 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 5806 5807 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 5808 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 5809 .add(SrcReg0Sub0) 5810 .add(SrcReg1Sub0); 5811 5812 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 5813 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 5814 .add(SrcReg0Sub1) 5815 .add(SrcReg1Sub1); 5816 5817 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 5818 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 5819 .addReg(DestSub0) 5820 .addImm(AMDGPU::sub0) 5821 .addReg(DestSub1) 5822 .addImm(AMDGPU::sub1); 5823 5824 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 5825 5826 Worklist.insert(&LoHalf); 5827 Worklist.insert(&HiHalf); 5828 5829 // Move all users of this moved vlaue. 5830 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 5831 } 5832 5833 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, 5834 MachineInstr &Inst, 5835 MachineDominatorTree *MDT) const { 5836 MachineBasicBlock &MBB = *Inst.getParent(); 5837 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5838 5839 MachineOperand &Dest = Inst.getOperand(0); 5840 MachineOperand &Src0 = Inst.getOperand(1); 5841 MachineOperand &Src1 = Inst.getOperand(2); 5842 const DebugLoc &DL = Inst.getDebugLoc(); 5843 5844 MachineBasicBlock::iterator MII = Inst; 5845 5846 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 5847 5848 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5849 5850 MachineOperand* Op0; 5851 MachineOperand* Op1; 5852 5853 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { 5854 Op0 = &Src0; 5855 Op1 = &Src1; 5856 } else { 5857 Op0 = &Src1; 5858 Op1 = &Src0; 5859 } 5860 5861 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) 5862 .add(*Op0); 5863 5864 Register NewDest = MRI.createVirtualRegister(DestRC); 5865 5866 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) 5867 .addReg(Interm) 5868 .add(*Op1); 5869 5870 MRI.replaceRegWith(Dest.getReg(), NewDest); 5871 5872 Worklist.insert(&Xor); 5873 } 5874 5875 void SIInstrInfo::splitScalar64BitBCNT( 5876 SetVectorType &Worklist, MachineInstr &Inst) const { 5877 MachineBasicBlock &MBB = *Inst.getParent(); 5878 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5879 5880 MachineBasicBlock::iterator MII = Inst; 5881 const DebugLoc &DL = Inst.getDebugLoc(); 5882 5883 MachineOperand &Dest = Inst.getOperand(0); 5884 MachineOperand &Src = Inst.getOperand(1); 5885 5886 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 5887 const TargetRegisterClass *SrcRC = Src.isReg() ? 
5888 MRI.getRegClass(Src.getReg()) : 5889 &AMDGPU::SGPR_32RegClass; 5890 5891 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5892 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5893 5894 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); 5895 5896 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 5897 AMDGPU::sub0, SrcSubRC); 5898 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 5899 AMDGPU::sub1, SrcSubRC); 5900 5901 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 5902 5903 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 5904 5905 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5906 5907 // We don't need to legalize operands here. src0 for etiher instruction can be 5908 // an SGPR, and the second input is unused or determined here. 5909 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5910 } 5911 5912 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, 5913 MachineInstr &Inst) const { 5914 MachineBasicBlock &MBB = *Inst.getParent(); 5915 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5916 MachineBasicBlock::iterator MII = Inst; 5917 const DebugLoc &DL = Inst.getDebugLoc(); 5918 5919 MachineOperand &Dest = Inst.getOperand(0); 5920 uint32_t Imm = Inst.getOperand(2).getImm(); 5921 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 5922 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 5923 5924 (void) Offset; 5925 5926 // Only sext_inreg cases handled. 5927 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 5928 Offset == 0 && "Not implemented"); 5929 5930 if (BitWidth < 32) { 5931 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5932 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5933 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5934 5935 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 5936 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 5937 .addImm(0) 5938 .addImm(BitWidth); 5939 5940 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 5941 .addImm(31) 5942 .addReg(MidRegLo); 5943 5944 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 5945 .addReg(MidRegLo) 5946 .addImm(AMDGPU::sub0) 5947 .addReg(MidRegHi) 5948 .addImm(AMDGPU::sub1); 5949 5950 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5951 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5952 return; 5953 } 5954 5955 MachineOperand &Src = Inst.getOperand(1); 5956 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5957 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5958 5959 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 5960 .addImm(31) 5961 .addReg(Src.getReg(), 0, AMDGPU::sub0); 5962 5963 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 5964 .addReg(Src.getReg(), 0, AMDGPU::sub0) 5965 .addImm(AMDGPU::sub0) 5966 .addReg(TmpReg) 5967 .addImm(AMDGPU::sub1); 5968 5969 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5970 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5971 } 5972 5973 void SIInstrInfo::addUsersToMoveToVALUWorklist( 5974 Register DstReg, 5975 MachineRegisterInfo &MRI, 5976 SetVectorType &Worklist) const { 5977 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 5978 E = MRI.use_end(); I != E;) { 5979 MachineInstr &UseMI = *I->getParent(); 5980 5981 unsigned OpNo = 0; 
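
    // For the copy-like opcodes below, the class of the result operand
    // (OpNo == 0) decides whether the user must move to the VALU; for
    // everything else, look at the class required at the operand that
    // actually uses DstReg.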
5982 5983 switch (UseMI.getOpcode()) { 5984 case AMDGPU::COPY: 5985 case AMDGPU::WQM: 5986 case AMDGPU::SOFT_WQM: 5987 case AMDGPU::WWM: 5988 case AMDGPU::REG_SEQUENCE: 5989 case AMDGPU::PHI: 5990 case AMDGPU::INSERT_SUBREG: 5991 break; 5992 default: 5993 OpNo = I.getOperandNo(); 5994 break; 5995 } 5996 5997 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { 5998 Worklist.insert(&UseMI); 5999 6000 do { 6001 ++I; 6002 } while (I != E && I->getParent() == &UseMI); 6003 } else { 6004 ++I; 6005 } 6006 } 6007 } 6008 6009 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, 6010 MachineRegisterInfo &MRI, 6011 MachineInstr &Inst) const { 6012 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6013 MachineBasicBlock *MBB = Inst.getParent(); 6014 MachineOperand &Src0 = Inst.getOperand(1); 6015 MachineOperand &Src1 = Inst.getOperand(2); 6016 const DebugLoc &DL = Inst.getDebugLoc(); 6017 6018 switch (Inst.getOpcode()) { 6019 case AMDGPU::S_PACK_LL_B32_B16: { 6020 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6021 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6022 6023 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 6024 // 0. 6025 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 6026 .addImm(0xffff); 6027 6028 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 6029 .addReg(ImmReg, RegState::Kill) 6030 .add(Src0); 6031 6032 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) 6033 .add(Src1) 6034 .addImm(16) 6035 .addReg(TmpReg, RegState::Kill); 6036 break; 6037 } 6038 case AMDGPU::S_PACK_LH_B32_B16: { 6039 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6040 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 6041 .addImm(0xffff); 6042 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) 6043 .addReg(ImmReg, RegState::Kill) 6044 .add(Src0) 6045 .add(Src1); 6046 break; 6047 } 6048 case AMDGPU::S_PACK_HH_B32_B16: { 6049 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6050 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6051 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 6052 .addImm(16) 6053 .add(Src0); 6054 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 6055 .addImm(0xffff0000); 6056 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) 6057 .add(Src1) 6058 .addReg(ImmReg, RegState::Kill) 6059 .addReg(TmpReg, RegState::Kill); 6060 break; 6061 } 6062 default: 6063 llvm_unreachable("unhandled s_pack_* instruction"); 6064 } 6065 6066 MachineOperand &Dest = Inst.getOperand(0); 6067 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6068 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6069 } 6070 6071 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, 6072 MachineInstr &SCCDefInst, 6073 SetVectorType &Worklist) const { 6074 // Ensure that def inst defines SCC, which is still live. 6075 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && 6076 !Op.isDead() && Op.getParent() == &SCCDefInst); 6077 SmallVector<MachineInstr *, 4> CopyToDelete; 6078 // This assumes that all the users of SCC are in the same block 6079 // as the SCC def. 6080 for (MachineInstr &MI : // Skip the def inst itself. 6081 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), 6082 SCCDefInst.getParent()->end())) { 6083 // Check if SCC is used first. 
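    // A COPY of SCC that only feeds S_ADD_CO_PSEUDO / S_SUB_CO_PSEUDO is
    // handled specially below: those users are retargeted to read VCC
    // directly and the copy itself is queued for deletion; every other SCC
    // reader is simply added to the move-to-VALU worklist.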
6084 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) { 6085 if (MI.isCopy()) { 6086 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 6087 unsigned DestReg = MI.getOperand(0).getReg(); 6088 SmallVector<MachineInstr *, 4> Users; 6089 for (auto &User : MRI.use_nodbg_instructions(DestReg)) { 6090 if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) || 6091 (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) { 6092 Users.push_back(&User); 6093 Worklist.insert(&User); 6094 } 6095 } 6096 for (auto &U : Users) 6097 U->getOperand(4).setReg(RI.getVCC()); 6098 CopyToDelete.push_back(&MI); 6099 } else 6100 Worklist.insert(&MI); 6101 } 6102 // Exit if we find another SCC def. 6103 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 6104 break; 6105 } 6106 for (auto &Copy : CopyToDelete) 6107 Copy->eraseFromParent(); 6108 } 6109 6110 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 6111 const MachineInstr &Inst) const { 6112 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 6113 6114 switch (Inst.getOpcode()) { 6115 // For target instructions, getOpRegClass just returns the virtual register 6116 // class associated with the operand, so we need to find an equivalent VGPR 6117 // register class in order to move the instruction to the VALU. 6118 case AMDGPU::COPY: 6119 case AMDGPU::PHI: 6120 case AMDGPU::REG_SEQUENCE: 6121 case AMDGPU::INSERT_SUBREG: 6122 case AMDGPU::WQM: 6123 case AMDGPU::SOFT_WQM: 6124 case AMDGPU::WWM: { 6125 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 6126 if (RI.hasAGPRs(SrcRC)) { 6127 if (RI.hasAGPRs(NewDstRC)) 6128 return nullptr; 6129 6130 switch (Inst.getOpcode()) { 6131 case AMDGPU::PHI: 6132 case AMDGPU::REG_SEQUENCE: 6133 case AMDGPU::INSERT_SUBREG: 6134 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 6135 break; 6136 default: 6137 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 6138 } 6139 6140 if (!NewDstRC) 6141 return nullptr; 6142 } else { 6143 if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 6144 return nullptr; 6145 6146 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 6147 if (!NewDstRC) 6148 return nullptr; 6149 } 6150 6151 return NewDstRC; 6152 } 6153 default: 6154 return NewDstRC; 6155 } 6156 } 6157 6158 // Find the one SGPR operand we are allowed to use. 6159 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 6160 int OpIndices[3]) const { 6161 const MCInstrDesc &Desc = MI.getDesc(); 6162 6163 // Find the one SGPR operand we are allowed to use. 6164 // 6165 // First we need to consider the instruction's operand requirements before 6166 // legalizing. Some operands are required to be SGPRs, such as implicit uses 6167 // of VCC, but we are still bound by the constant bus requirement to only use 6168 // one. 6169 // 6170 // If the operand's class is an SGPR, we can never move it. 6171 6172 Register SGPRReg = findImplicitSGPRRead(MI); 6173 if (SGPRReg != AMDGPU::NoRegister) 6174 return SGPRReg; 6175 6176 Register UsedSGPRs[3] = { AMDGPU::NoRegister }; 6177 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 6178 6179 for (unsigned i = 0; i < 3; ++i) { 6180 int Idx = OpIndices[i]; 6181 if (Idx == -1) 6182 break; 6183 6184 const MachineOperand &MO = MI.getOperand(Idx); 6185 if (!MO.isReg()) 6186 continue; 6187 6188 // Is this operand statically required to be an SGPR based on the operand 6189 // constraints? 
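    // (For example, an operand whose class in the instruction description is
    // one of the SReg_* classes can never be replaced with a VGPR.)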
6190 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 6191 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 6192 if (IsRequiredSGPR) 6193 return MO.getReg(); 6194 6195 // If this could be a VGPR or an SGPR, Check the dynamic register class. 6196 Register Reg = MO.getReg(); 6197 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 6198 if (RI.isSGPRClass(RegRC)) 6199 UsedSGPRs[i] = Reg; 6200 } 6201 6202 // We don't have a required SGPR operand, so we have a bit more freedom in 6203 // selecting operands to move. 6204 6205 // Try to select the most used SGPR. If an SGPR is equal to one of the 6206 // others, we choose that. 6207 // 6208 // e.g. 6209 // V_FMA_F32 v0, s0, s0, s0 -> No moves 6210 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 6211 6212 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 6213 // prefer those. 6214 6215 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 6216 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 6217 SGPRReg = UsedSGPRs[0]; 6218 } 6219 6220 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 6221 if (UsedSGPRs[1] == UsedSGPRs[2]) 6222 SGPRReg = UsedSGPRs[1]; 6223 } 6224 6225 return SGPRReg; 6226 } 6227 6228 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 6229 unsigned OperandName) const { 6230 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 6231 if (Idx == -1) 6232 return nullptr; 6233 6234 return &MI.getOperand(Idx); 6235 } 6236 6237 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 6238 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 6239 return (22ULL << 44) | // IMG_FORMAT_32_FLOAT 6240 (1ULL << 56) | // RESOURCE_LEVEL = 1 6241 (3ULL << 60); // OOB_SELECT = 3 6242 } 6243 6244 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 6245 if (ST.isAmdHsaOS()) { 6246 // Set ATC = 1. GFX9 doesn't have this bit. 6247 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) 6248 RsrcDataFormat |= (1ULL << 56); 6249 6250 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. 6251 // BTW, it disables TC L2 and therefore decreases performance. 6252 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) 6253 RsrcDataFormat |= (2ULL << 59); 6254 } 6255 6256 return RsrcDataFormat; 6257 } 6258 6259 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 6260 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 6261 AMDGPU::RSRC_TID_ENABLE | 6262 0xffffffff; // Size; 6263 6264 // GFX9 doesn't have ELEMENT_SIZE. 6265 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 6266 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; 6267 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; 6268 } 6269 6270 // IndexStride = 64 / 32. 6271 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; 6272 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; 6273 6274 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 6275 // Clear them unless we want a huge stride. 
6276 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 6277 ST.getGeneration() <= AMDGPUSubtarget::GFX9) 6278 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 6279 6280 return Rsrc23; 6281 } 6282 6283 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 6284 unsigned Opc = MI.getOpcode(); 6285 6286 return isSMRD(Opc); 6287 } 6288 6289 bool SIInstrInfo::isHighLatencyDef(int Opc) const { 6290 return get(Opc).mayLoad() && 6291 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); 6292 } 6293 6294 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 6295 int &FrameIndex) const { 6296 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 6297 if (!Addr || !Addr->isFI()) 6298 return AMDGPU::NoRegister; 6299 6300 assert(!MI.memoperands_empty() && 6301 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 6302 6303 FrameIndex = Addr->getIndex(); 6304 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 6305 } 6306 6307 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 6308 int &FrameIndex) const { 6309 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 6310 assert(Addr && Addr->isFI()); 6311 FrameIndex = Addr->getIndex(); 6312 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 6313 } 6314 6315 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 6316 int &FrameIndex) const { 6317 if (!MI.mayLoad()) 6318 return AMDGPU::NoRegister; 6319 6320 if (isMUBUF(MI) || isVGPRSpill(MI)) 6321 return isStackAccess(MI, FrameIndex); 6322 6323 if (isSGPRSpill(MI)) 6324 return isSGPRStackAccess(MI, FrameIndex); 6325 6326 return AMDGPU::NoRegister; 6327 } 6328 6329 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 6330 int &FrameIndex) const { 6331 if (!MI.mayStore()) 6332 return AMDGPU::NoRegister; 6333 6334 if (isMUBUF(MI) || isVGPRSpill(MI)) 6335 return isStackAccess(MI, FrameIndex); 6336 6337 if (isSGPRSpill(MI)) 6338 return isSGPRStackAccess(MI, FrameIndex); 6339 6340 return AMDGPU::NoRegister; 6341 } 6342 6343 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 6344 unsigned Size = 0; 6345 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 6346 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 6347 while (++I != E && I->isInsideBundle()) { 6348 assert(!I->isBundle() && "No nested bundle!"); 6349 Size += getInstSizeInBytes(*I); 6350 } 6351 6352 return Size; 6353 } 6354 6355 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 6356 unsigned Opc = MI.getOpcode(); 6357 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 6358 unsigned DescSize = Desc.getSize(); 6359 6360 // If we have a definitive size, we can use it. Otherwise we need to inspect 6361 // the operands to know the size. 6362 if (isFixedSize(MI)) 6363 return DescSize; 6364 6365 // 4-byte instructions may have a 32-bit literal encoded after them. Check 6366 // operands that coud ever be literals. 6367 if (isVALU(MI) || isSALU(MI)) { 6368 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 6369 if (Src0Idx == -1) 6370 return DescSize; // No operands. 6371 6372 if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) 6373 return isVOP3(MI) ? 12 : (DescSize + 4); 6374 6375 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 6376 if (Src1Idx == -1) 6377 return DescSize; 6378 6379 if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) 6380 return isVOP3(MI) ? 
12 : (DescSize + 4); 6381 6382 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 6383 if (Src2Idx == -1) 6384 return DescSize; 6385 6386 if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) 6387 return isVOP3(MI) ? 12 : (DescSize + 4); 6388 6389 return DescSize; 6390 } 6391 6392 // Check whether we have extra NSA words. 6393 if (isMIMG(MI)) { 6394 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 6395 if (VAddr0Idx < 0) 6396 return 8; 6397 6398 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 6399 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); 6400 } 6401 6402 switch (Opc) { 6403 case TargetOpcode::IMPLICIT_DEF: 6404 case TargetOpcode::KILL: 6405 case TargetOpcode::DBG_VALUE: 6406 case TargetOpcode::EH_LABEL: 6407 return 0; 6408 case TargetOpcode::BUNDLE: 6409 return getInstBundleSize(MI); 6410 case TargetOpcode::INLINEASM: 6411 case TargetOpcode::INLINEASM_BR: { 6412 const MachineFunction *MF = MI.getParent()->getParent(); 6413 const char *AsmStr = MI.getOperand(0).getSymbolName(); 6414 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), 6415 &MF->getSubtarget()); 6416 } 6417 default: 6418 return DescSize; 6419 } 6420 } 6421 6422 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 6423 if (!isFLAT(MI)) 6424 return false; 6425 6426 if (MI.memoperands_empty()) 6427 return true; 6428 6429 for (const MachineMemOperand *MMO : MI.memoperands()) { 6430 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 6431 return true; 6432 } 6433 return false; 6434 } 6435 6436 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 6437 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 6438 } 6439 6440 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 6441 MachineBasicBlock *IfEnd) const { 6442 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 6443 assert(TI != IfEntry->end()); 6444 6445 MachineInstr *Branch = &(*TI); 6446 MachineFunction *MF = IfEntry->getParent(); 6447 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 6448 6449 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 6450 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 6451 MachineInstr *SIIF = 6452 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 6453 .add(Branch->getOperand(0)) 6454 .add(Branch->getOperand(1)); 6455 MachineInstr *SIEND = 6456 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 6457 .addReg(DstReg); 6458 6459 IfEntry->erase(TI); 6460 IfEntry->insert(IfEntry->end(), SIIF); 6461 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 6462 } 6463 } 6464 6465 void SIInstrInfo::convertNonUniformLoopRegion( 6466 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 6467 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 6468 // We expect 2 terminators, one conditional and one unconditional. 
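  //
  // Sketch of the rewrite performed below (placeholder vregs):
  //   loop header:  %mask = PHI [ 0, <other preds> ], [ %brk, <loop end> ]
  //   loop end:     %brk  = SI_IF_BREAK %mask, %cond
  //                         SI_LOOP %brk, <loop header>
  // replacing the original SI_NON_UNIFORM_BRCOND_PSEUDO terminator.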
6469 assert(TI != LoopEnd->end()); 6470 6471 MachineInstr *Branch = &(*TI); 6472 MachineFunction *MF = LoopEnd->getParent(); 6473 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 6474 6475 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 6476 6477 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 6478 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); 6479 MachineInstrBuilder HeaderPHIBuilder = 6480 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 6481 for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), 6482 E = LoopEntry->pred_end(); 6483 PI != E; ++PI) { 6484 if (*PI == LoopEnd) { 6485 HeaderPHIBuilder.addReg(BackEdgeReg); 6486 } else { 6487 MachineBasicBlock *PMBB = *PI; 6488 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); 6489 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 6490 ZeroReg, 0); 6491 HeaderPHIBuilder.addReg(ZeroReg); 6492 } 6493 HeaderPHIBuilder.addMBB(*PI); 6494 } 6495 MachineInstr *HeaderPhi = HeaderPHIBuilder; 6496 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 6497 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 6498 .addReg(DstReg) 6499 .add(Branch->getOperand(0)); 6500 MachineInstr *SILOOP = 6501 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 6502 .addReg(BackEdgeReg) 6503 .addMBB(LoopEntry); 6504 6505 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 6506 LoopEnd->erase(TI); 6507 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 6508 LoopEnd->insert(LoopEnd->end(), SILOOP); 6509 } 6510 } 6511 6512 ArrayRef<std::pair<int, const char *>> 6513 SIInstrInfo::getSerializableTargetIndices() const { 6514 static const std::pair<int, const char *> TargetIndices[] = { 6515 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 6516 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 6517 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 6518 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 6519 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 6520 return makeArrayRef(TargetIndices); 6521 } 6522 6523 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 6524 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 6525 ScheduleHazardRecognizer * 6526 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 6527 const ScheduleDAG *DAG) const { 6528 return new GCNHazardRecognizer(DAG->MF); 6529 } 6530 6531 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 6532 /// pass. 
6533 ScheduleHazardRecognizer * 6534 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 6535 return new GCNHazardRecognizer(MF); 6536 } 6537 6538 std::pair<unsigned, unsigned> 6539 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 6540 return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); 6541 } 6542 6543 ArrayRef<std::pair<unsigned, const char *>> 6544 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 6545 static const std::pair<unsigned, const char *> TargetFlags[] = { 6546 { MO_GOTPCREL, "amdgpu-gotprel" }, 6547 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 6548 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 6549 { MO_REL32_LO, "amdgpu-rel32-lo" }, 6550 { MO_REL32_HI, "amdgpu-rel32-hi" }, 6551 { MO_ABS32_LO, "amdgpu-abs32-lo" }, 6552 { MO_ABS32_HI, "amdgpu-abs32-hi" }, 6553 }; 6554 6555 return makeArrayRef(TargetFlags); 6556 } 6557 6558 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 6559 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 6560 MI.modifiesRegister(AMDGPU::EXEC, &RI); 6561 } 6562 6563 MachineInstrBuilder 6564 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 6565 MachineBasicBlock::iterator I, 6566 const DebugLoc &DL, 6567 Register DestReg) const { 6568 if (ST.hasAddNoCarry()) 6569 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 6570 6571 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6572 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); 6573 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); 6574 6575 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) 6576 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 6577 } 6578 6579 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 6580 MachineBasicBlock::iterator I, 6581 const DebugLoc &DL, 6582 Register DestReg, 6583 RegScavenger &RS) const { 6584 if (ST.hasAddNoCarry()) 6585 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); 6586 6587 // If available, prefer to use vcc. 6588 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) 6589 ? Register(RI.getVCC()) 6590 : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); 6591 6592 // TODO: Users need to deal with this. 
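  // (i.e. when no carry register can be scavenged this returns an empty
  // builder, and callers are expected to check for a null instruction before
  // adding operands.)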
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}

void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  if (!ST.isWave32())
    return;

  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}

bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}

unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
                                           bool Signed) const {
  if (!ST.hasFlatInstOffsets())
    return 0;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return 0;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
    return Signed ? 12 : 11;

  return Signed ? 13 : 12;
}

bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    bool Signed) const {
  // TODO: Should 0 be special cased?
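  // The accepted widths mirror getNumFlatOffsetBits() above: 12-bit signed /
  // 11-bit unsigned offsets on GFX10, and 13-bit signed / 12-bit unsigned
  // offsets on older subtargets that support FLAT instruction offsets.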
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return false;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (Signed && isInt<12>(Offset)) ||
           (!Signed && isUInt<11>(Offset));
  }

  return (Signed && isInt<13>(Offset)) ||
         (!Signed && isUInt<12>(Offset));
}

// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5,
  GFX10 = 6,
  SDWA10 = 7
};

static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch(MCOp) {
  // These opcodes use indirect register addressing so
  // they need special handling by codegen (currently missing).
  // Therefore it is too risky to allow these opcodes
  // to be selected by dpp combiner or sdwa peepholer.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}

int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  SIEncodingFamily Gen = subtargetEncodingFamily(ST);

  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
      ST.getGeneration() == AMDGPUSubtarget::GFX9)
    Gen = SIEncodingFamily::GFX9;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      Gen = SIEncodingFamily::SDWA;
      break;
    case AMDGPUSubtarget::GFX9:
      Gen = SIEncodingFamily::SDWA9;
      break;
    case AMDGPUSubtarget::GFX10:
      Gen = SIEncodingFamily::SDWA10;
      break;
    }
  }

  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
  // no encoding in the given subtarget generation.
  if (MCOp == (uint16_t)-1)
    return -1;

  if (isAsmOnlyOpcode(MCOp))
    return -1;

  return MCOp;
}

static
TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
  assert(RegOpnd.isReg());
  return RegOpnd.isUndef() ?
             TargetInstrInfo::RegSubRegPair() :
             getRegSubRegPair(RegOpnd);
}

TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1) / 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}

// Try to find the definition of reg:subreg in subreg-manipulation pseudos
// Following a subreg of reg:subreg isn't supported
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}

MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!Register::isVirtualRegister(P.Reg))
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
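  // Walk forward from just past DefMI toward UseMI; if more than MaxInstScan
  // non-debug instructions are seen, or any of them writes EXEC,
  // conservatively report that EXEC may be modified.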
  auto E = UseMI.getIterator();
  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }

  return false;
}

bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  const int MaxUseInstScan = 10;
  int NumUseInst = 0;

  for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
    // Don't bother searching between blocks, although it is possible this block
    // doesn't modify exec.
    if (UseInst.getParent() != DefBB)
      return true;

    if (++NumUseInst > MaxUseInstScan)
      return true;
  }

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->readsRegister(VReg))
      if (--NumUseInst == 0)
        return false;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }
}

MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}

MachineInstr *SIInstrInfo::createPHISourceCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
  if (InsPt != MBB.end() &&
      (InsPt->getOpcode() == AMDGPU::SI_IF ||
       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
      InsPt->definesRegister(Src)) {
    InsPt++;
    return BuildMI(MBB, InsPt, DL,
                   get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
                                     : AMDGPU::S_MOV_B64_term),
                   Dst)
        .addReg(Src, 0, SrcSubReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);
  }
  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
                                              Dst);
}

bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }

MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
    VirtRegMap *VRM) const {
  // This is a bit of a hack (copied from AArch64). Consider this instruction:
  //
  //   %0:sreg_32 = COPY $m0
  //
  // We explicitly chose SReg_32 for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill $m0 normally (it would require copying to
  // a numbered SGPR anyway), and since it is in the SReg_32 register class,
  // TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
      return nullptr;
    }

    if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
      return nullptr;
    }
  }

  return nullptr;
}

unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                      const MachineInstr &MI,
                                      unsigned *PredCost) const {
  if (MI.isBundle()) {
    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
    unsigned Lat = 0, Count = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      ++Count;
      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
    }
    return Lat + Count - 1;
  }

  return SchedModel.computeInstrLatency(&MI);
}