1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI Implementation of TargetInstrInfo. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIInstrInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUSubtarget.h" 17 #include "GCNHazardRecognizer.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "SIRegisterInfo.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "Utils/AMDGPUBaseInfo.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ArrayRef.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/StringRef.h" 27 #include "llvm/ADT/iterator_range.h" 28 #include "llvm/Analysis/AliasAnalysis.h" 29 #include "llvm/Analysis/MemoryLocation.h" 30 #include "llvm/Analysis/ValueTracking.h" 31 #include "llvm/CodeGen/MachineBasicBlock.h" 32 #include "llvm/CodeGen/MachineDominators.h" 33 #include "llvm/CodeGen/MachineFrameInfo.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineInstr.h" 36 #include "llvm/CodeGen/MachineInstrBuilder.h" 37 #include "llvm/CodeGen/MachineInstrBundle.h" 38 #include "llvm/CodeGen/MachineMemOperand.h" 39 #include "llvm/CodeGen/MachineOperand.h" 40 #include "llvm/CodeGen/MachineRegisterInfo.h" 41 #include "llvm/CodeGen/RegisterScavenging.h" 42 #include "llvm/CodeGen/ScheduleDAG.h" 43 #include "llvm/CodeGen/SelectionDAGNodes.h" 44 #include "llvm/CodeGen/TargetOpcodes.h" 45 #include "llvm/CodeGen/TargetRegisterInfo.h" 46 #include "llvm/IR/DebugLoc.h" 47 #include "llvm/IR/DiagnosticInfo.h" 48 #include "llvm/IR/Function.h" 49 #include "llvm/IR/InlineAsm.h" 50 #include 
"llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}

// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests
// for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

// Workaround flag for mixed 16/32-bit physreg copies; see the
// Fix16BitCopies handling at the top of copyPhysReg() below.
static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  // Initialize the per-subtarget scheduling model used by latency queries.
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

/// \returns the number of operands of \p Node, not counting any trailing
/// glue operands.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p 
/// Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  // Neither opcode has this named operand: trivially equal.
  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  // Exactly one of the two has the operand: not comparable.
  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for this.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // No implicit operands.
    return MI.getNumOperands() == MI.getDesc().getNumOperands();
  default:
    return false;
  }
}

/// TargetInstrInfo hook: report whether two selected load nodes read from
/// the same base pointer, and if so return their constant offsets so the
/// scheduler can reason about their proximity.
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of datalesss loads
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Operand 1 is the SMRD offset; it may be a FrameIndex or other
    // non-constant node, in which case we cannot report offsets.
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}

/// \returns true for DS read2/write2 "st64" opcodes, whose offset operands
/// are scaled by an extra factor of 64.
static bool isStride64(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::DS_READ2ST64_B32:
  case AMDGPU::DS_READ2ST64_B64:
  case AMDGPU::DS_WRITE2ST64_B32:
  case AMDGPU::DS_WRITE2ST64_B64:
    return true;
  default:
    return false;
  }
}

/// TargetInstrInfo hook: decompose a load/store into its base operand(s),
/// immediate byte offset, and access width so the generic scheduler can
/// cluster neighboring memory operations. Handles the DS, MUBUF/MTBUF,
/// MIMG, SMRD and FLAT encodings; returns false for anything else.
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      Width = getOpSize(LdSt, DataOpIdx);
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can
      // treat these as a load with a single offset if the 2 offsets are
      // consecutive.
      // We will use this for some partially aligned loads.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm();
      unsigned Offset1 = Offset1Op->getImm();
      // Only report the pair as one access when the two element offsets are
      // adjacent.
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to
      // convert to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        // NOTE(review): /16 rather than /8 — presumably because the single
        // destination register of a read2 covers both elements, so the
        // per-element size is half the dest size. TODO confirm.
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        // Store: width is the sum of both data operands.
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = getOpSize(LdSt, DataOpIdx);
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width += getOpSize(LdSt, DataOpIdx);
      } else {
        Width = getOpSize(LdSt, DataOpIdx);
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset && SOffset->isReg()) {
      // We can only handle this if it's a stack access, as any other resource
      // would require reporting multiple base registers.
      const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (AddrReg && !AddrReg->isFI())
        return false;

      const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      const SIMachineFunctionInfo *MFI
        = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
      if (RSrc->getReg() != MFI->getScratchRSrcReg())
        return false;

      const MachineOperand *OffsetImm =
          getNamedOperand(LdSt, AMDGPU::OpName::offset);
      BaseOps.push_back(RSrc);
      BaseOps.push_back(SOffset);
      Offset = OffsetImm->getImm();
    } else {
      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
      if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
        return false;
      BaseOps.push_back(BaseOp);

      BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
      if (BaseOp)
        BaseOps.push_back(BaseOp);

      const MachineOperand *OffsetImm =
          getNamedOperand(LdSt, AMDGPU::OpName::offset);
      Offset = OffsetImm->getImm();
      if (SOffset) // soffset can be an inline immediate.
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isMIMG(LdSt)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    // MIMG has no immediate offset operand; Width is intentionally left
    // unset here.
    Offset = 0;
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  return false;
}

/// \returns true if the two memory operations appear to address the same
/// base pointer, first by comparing base operands, then by falling back to
/// comparing the underlying IR objects of their memory operands.
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  const MachineFunction &MF = *MI1.getParent()->getParent();
  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
  // Strip GEPs/casts down to the underlying IR object before comparing.
  Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

/// TargetInstrInfo hook: decide whether two memory operations with the same
/// base should be scheduled next to each other. Limits clustering by the
/// approximate number of destination bytes to keep register pressure down.
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  assert(!BaseOps1.empty() && !BaseOps2.empty());
  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();

  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
    return false;

  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;

  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
      (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) ||
      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
    const unsigned MaxGlobalLoadCluster = 7;
    if (NumLoads > MaxGlobalLoadCluster)
      return false;

    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
    if (!FirstDst)
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
    if (!SecondDst)
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
  }

  if (!FirstDst || !SecondDst)
    return false;

  // Try to limit clustering based on the total number of bytes loaded
  // rather than the number of instructions. This is done to help reduce
  // register pressure. The method used is somewhat inexact, though,
  // because it assumes that all loads in the cluster will load the
  // same number of bytes as FirstLdSt.

  // The unit of this value is bytes.
  // FIXME: This needs finer tuning.
  unsigned LoadClusterThreshold = 16;

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();

  const Register Reg = FirstDst->getReg();

  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
                                         ? MRI.getRegClass(Reg)
                                         : RI.getPhysRegClass(Reg);

  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
  // of clusterNeighboringMemOps which was previosly passing cluster length
  // less 1. LoadClusterThreshold should be tuned instead.
  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
         LoadClusterThreshold;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 16 store batches.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have less than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

/// Emit a DS_Error diagnostic for an unlowerable copy and insert a
/// SI_ILLEGAL_COPY placeholder so codegen can proceed past the error.
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal SGPR to VGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Lower a physical-register COPY to real machine instructions, dispatching
/// on the destination register class: single VGPR/SGPR/AGPR moves, SCC and
/// VCC special cases, 16-bit subregister copies (via SDWA where needed), and
/// finally a generic per-subregister move loop for wide register tuples.
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc) const {
  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);

  // FIXME: This is hack to resolve copies between 16 bit and 32 bit
  // registers until all patterns are fixed.
  if (Fix16BitCopies &&
      ((RI.getRegSizeInBits(*RC) == 16) ^
       (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
    // Widen the 16-bit side of the copy to its 32-bit super-register.
    MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
    MCRegister Super = RI.get32BitRegister(RegToFix);
    assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
    RegToFix = Super;

    if (DestReg == SrcReg) {
      // Insert empty bundle since ExpandPostRA expects an instruction here.
      BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
      return;
    }

    RC = RI.getPhysRegClass(DestReg);
  }

  if (RC == &AMDGPU::VGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
                     AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
      RC == &AMDGPU::SReg_32RegClass) {
    if (SrcReg == AMDGPU::SCC) {
      // Materialize SCC as 1/0 via a conditional select.
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(1)
          .addImm(0);
      return;
    }

    if (DestReg == AMDGPU::VCC_LO) {
      if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RC == &AMDGPU::SReg_64RegClass) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AMDGPU::SCC) {
    // Copy into SCC by comparing the source against zero.
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
      .addReg(SrcReg, getKillRegState(KillSrc))
      .addImm(0);
    return;
  }

  if (RC == &AMDGPU::AGPR_32RegClass) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_32RegClass.contains(SrcReg));
    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
      // Non-VGPR sources cannot feed V_ACCVGPR_WRITE directly.
      // First try to find defining accvgpr_write to avoid temporary registers.
      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
        --Def;
        if (!Def->definesRegister(SrcReg, &RI))
          continue;
        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
          break;

        MachineOperand &DefOp = Def->getOperand(1);
        assert(DefOp.isReg() || DefOp.isImm());

        if (DefOp.isReg()) {
          // Check that register source operand if not clobbered before MI.
          // Immediate operands are always safe to propagate.
          bool SafeToPropagate = true;
          for (auto I = Def; I != MI && SafeToPropagate; ++I)
            if (I->modifiesRegister(DefOp.getReg(), &RI))
              SafeToPropagate = false;

          if (!SafeToPropagate)
            break;

          DefOp.setIsKill(false);
        }

        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
          .add(DefOp);
        return;
      }

      // Fall back to bouncing the value through a scavenged VGPR.
      RegScavenger RS;
      RS.enterBasicBlock(MBB);
      RS.forward(MI);

      // Ideally we want to have three registers for a long reg_sequence copy
      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                                 *MBB.getParent());

      // Registers in the sequence are allocated contiguously so we can just
      // use register number to pick one of three round-robin temps.
      unsigned RegNo = DestReg % 3;
      Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
      if (!Tmp)
        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
      RS.setRegUsed(Tmp);
      // Only loop through if there are any free registers left, otherwise
      // scavenger may report a fatal error without emergency spill slot
      // or spill with the slot.
      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
          break;
        Tmp = Tmp2;
        RS.setRegUsed(Tmp);
      }
      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
        .addReg(Tmp, RegState::Kill);
      return;
    }

    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (RI.getRegSizeInBits(*RC) == 16) {
    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
           AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
           AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
           AMDGPU::AGPR_LO16RegClass.contains(SrcReg));

    bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
    bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
    bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(DestReg);
    bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
                  AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
    MCRegister NewDestReg = RI.get32BitRegister(DestReg);
    MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);

    if (IsSGPRDst) {
      if (!IsSGPRSrc) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
        return;
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    if (IsAGPRDst || IsAGPRSrc) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg with an AGPR!");
      }

      copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
      return;
    }

    if (IsSGPRSrc && !ST.hasSDWAScalar()) {
      if (!DstLow || !SrcLow) {
        reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
                          "Cannot use hi16 subreg on VI!");
      }

      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
        .addReg(NewSrcReg, getKillRegState(KillSrc));
      return;
    }

    // General 16-bit case: select the right half-word of source and
    // destination with an SDWA move, preserving the other half.
    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
      .addImm(0) // src0_modifiers
      .addReg(NewSrcReg)
      .addImm(0) // clamp
      .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                     : AMDGPU::SDWA::SdwaSel::WORD_1)
      .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
      .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
                     : AMDGPU::SDWA::SdwaSel::WORD_1)
      .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
    // First implicit operand is $exec.
    MIB->tieOperands(0, MIB->getNumOperands() - 1);
    return;
  }

  // Wide register tuples: split into per-subregister moves.
  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RC)) {
    // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
    if (!(RI.getRegSizeInBits(*RC) % 64)) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }

    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
  } else if (RI.hasAGPRs(RC)) {
    Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
      AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
    Opcode = AMDGPU::V_ACCVGPR_READ_B32;
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
  // Copy in ascending order when dest <= src so overlapping tuples are not
  // clobbered mid-copy; descending otherwise.
  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    if (Opcode == TargetOpcode::COPY) {
      // AGPR<->AGPR on targets without a direct move: recurse so the
      // AGPR_32 path above handles each element.
      copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
                  RI.getSubReg(SrcReg, SubIdx), KillSrc);
      continue;
    }

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);

    // Only the last partial copy may kill the full source register.
    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
  }
}

/// \returns the commuted form of \p Opcode if it exists on this target,
/// -1 if a commuted form is known but unavailable, or \p Opcode itself if
/// no commute mapping exists.
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  int NewOpc;

  // Try to map original to commuted opcode
  NewOpc = AMDGPU::getCommuteRev(Opcode);
  if (NewOpc != -1)
    // Check if the commuted (REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;

  // Try to map commuted to original opcode
  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  if (NewOpc != -1)
    // Check if the original (non-REV) opcode exists on the target.
    return pseudoToMCOpcode(NewOpc) != -1 ?
                                            NewOpc : -1;

  return Opcode;
}

/// Emit the cheapest instruction sequence that loads the constant \p Value
/// into physical register \p DestReg: a single s_mov/v_mov for 32/64-bit
/// classes, otherwise a per-subregister sequence where only the lowest
/// element receives \p Value and the remaining elements are zeroed.
void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  if (RegClass == &AMDGPU::VReg_64RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  // Wider classes: split into 32- or 64-bit element moves.
  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    // Only the first (lowest) element carries the value; upper elements
    // are written with zero.
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
    Builder.addImm(IdxValue);
  }
}

const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  // Selects are lowered through v_cndmask, so always prefer a 32-bit VGPR.
  return &AMDGPU::VGPR_32RegClass;
}

/// Emit a vector (per-lane) select of \p TrueReg / \p FalseReg into the
/// 32-bit VGPR \p DstReg, lowering the condition in \p Cond (either a plain
/// mask operand or a (predicate-kind, register) pair) to v_cndmask_b32.
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    // Single operand: already a lane-mask value; copy and select on it.
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      // Broadcast SCC into a full lane mask (all-ones / all-zeros).
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ?
AMDGPU::S_CSELECT_B32
                                     : AMDGPU::S_CSELECT_B64), SReg)
        // SCC_FALSE: select 0/1 inverted relative to SCC_TRUE.
        .addImm(0)
        .addImm(1);
      // dst = SReg ? TrueReg : FalseReg (TrueReg rides in the src1 slot).
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      // The VCC operand arrives as an implicit use on the branch; strip the
      // implicit flag so it can be used as an explicit COPY source.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      // Same select as VCCNZ but with TrueReg/FalseReg swapped: the condition
      // is the inverse (VCC == 0).
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(TrueReg)
        .addImm(0)
        .addReg(FalseReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      // S_OR_SAVEEXEC with 0 leaves exec unchanged but updates SCC based on
      // exec (NOTE(review): relies on SCC = (exec != 0) semantics — confirm
      // against the ISA doc). SReg2 only exists to receive the saved exec.
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      // Materialize the SCC result as a lane mask for the select below.
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ?
AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      // NOTE(review): the instructions built above are dead — this case
      // always aborts. Presumably EXECZ is never produced by callers of this
      // function; confirm before relying on the builds above.
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}

// Emit a V_CMP_EQ against an immediate and return the virtual lane-mask
// register holding the per-lane result of (Value == SrcReg).
Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

// Emit a V_CMP_NE against an immediate and return the virtual lane-mask
// register holding the per-lane result of (Value != SrcReg).
Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}

// Pick the natural full-register move opcode for \p DstRC: scalar vs. vector
// and 32- vs. 64-bit; anything else (including AGPRs) falls back to COPY.
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

  if (RI.hasAGPRs(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 32) {
    return RI.isSGPRClass(DstRC) ?
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
    return AMDGPU::S_MOV_B64;
  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
  }
  return AMDGPU::COPY;
}

// Map a vector register size in bits to the matching VGPR indirect-write
// pseudo. Sizes are bucketed upward to the next supported vector width.
static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}

// Map a vector register size in bits to the matching SGPR indirect-write
// pseudo with a 32-bit element type.
static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
  if (VecSize <= 32) // 4 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
  if (VecSize <= 64) // 8 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
  if (VecSize <= 96) // 12 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
  if (VecSize <= 128) // 16 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
  if (VecSize <= 160) // 20 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
  if (VecSize <= 256) // 32 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
  if (VecSize <= 512) // 64 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
  if (VecSize <= 1024) // 128 bytes
    return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;

  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
} 1149 1150 static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) { 1151 if (VecSize <= 64) // 8 bytes 1152 return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1; 1153 if (VecSize <= 128) // 16 bytes 1154 return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2; 1155 if (VecSize <= 256) // 32 bytes 1156 return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4; 1157 if (VecSize <= 512) // 64 bytes 1158 return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8; 1159 if (VecSize <= 1024) // 128 bytes 1160 return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16; 1161 1162 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1163 } 1164 1165 const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo( 1166 unsigned VecSize, unsigned EltSize, bool IsSGPR) const { 1167 if (IsSGPR) { 1168 switch (EltSize) { 1169 case 32: 1170 return get(getIndirectSGPRWritePseudo32(VecSize)); 1171 case 64: 1172 return get(getIndirectSGPRWritePseudo64(VecSize)); 1173 default: 1174 llvm_unreachable("invalid reg indexing elt size"); 1175 } 1176 } 1177 1178 assert(EltSize == 32 && "invalid reg indexing elt size"); 1179 return get(getIndirectVGPRWritePseudoOpc(VecSize)); 1180 } 1181 1182 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 1183 switch (Size) { 1184 case 4: 1185 return AMDGPU::SI_SPILL_S32_SAVE; 1186 case 8: 1187 return AMDGPU::SI_SPILL_S64_SAVE; 1188 case 12: 1189 return AMDGPU::SI_SPILL_S96_SAVE; 1190 case 16: 1191 return AMDGPU::SI_SPILL_S128_SAVE; 1192 case 20: 1193 return AMDGPU::SI_SPILL_S160_SAVE; 1194 case 32: 1195 return AMDGPU::SI_SPILL_S256_SAVE; 1196 case 64: 1197 return AMDGPU::SI_SPILL_S512_SAVE; 1198 case 128: 1199 return AMDGPU::SI_SPILL_S1024_SAVE; 1200 default: 1201 llvm_unreachable("unknown register size"); 1202 } 1203 } 1204 1205 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 1206 switch (Size) { 1207 case 4: 1208 return AMDGPU::SI_SPILL_V32_SAVE; 1209 case 8: 1210 return AMDGPU::SI_SPILL_V64_SAVE; 1211 case 12: 1212 return AMDGPU::SI_SPILL_V96_SAVE; 1213 case 16: 1214 return 
AMDGPU::SI_SPILL_V128_SAVE; 1215 case 20: 1216 return AMDGPU::SI_SPILL_V160_SAVE; 1217 case 32: 1218 return AMDGPU::SI_SPILL_V256_SAVE; 1219 case 64: 1220 return AMDGPU::SI_SPILL_V512_SAVE; 1221 case 128: 1222 return AMDGPU::SI_SPILL_V1024_SAVE; 1223 default: 1224 llvm_unreachable("unknown register size"); 1225 } 1226 } 1227 1228 static unsigned getAGPRSpillSaveOpcode(unsigned Size) { 1229 switch (Size) { 1230 case 4: 1231 return AMDGPU::SI_SPILL_A32_SAVE; 1232 case 8: 1233 return AMDGPU::SI_SPILL_A64_SAVE; 1234 case 16: 1235 return AMDGPU::SI_SPILL_A128_SAVE; 1236 case 64: 1237 return AMDGPU::SI_SPILL_A512_SAVE; 1238 case 128: 1239 return AMDGPU::SI_SPILL_A1024_SAVE; 1240 default: 1241 llvm_unreachable("unknown register size"); 1242 } 1243 } 1244 1245 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 1246 MachineBasicBlock::iterator MI, 1247 Register SrcReg, bool isKill, 1248 int FrameIndex, 1249 const TargetRegisterClass *RC, 1250 const TargetRegisterInfo *TRI) const { 1251 MachineFunction *MF = MBB.getParent(); 1252 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1253 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1254 const DebugLoc &DL = MBB.findDebugLoc(MI); 1255 1256 MachinePointerInfo PtrInfo 1257 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 1258 MachineMemOperand *MMO = MF->getMachineMemOperand( 1259 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), 1260 FrameInfo.getObjectAlign(FrameIndex)); 1261 unsigned SpillSize = TRI->getSpillSize(*RC); 1262 1263 if (RI.isSGPRClass(RC)) { 1264 MFI->setHasSpilledSGPRs(); 1265 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); 1266 1267 // We are only allowed to create one new instruction when spilling 1268 // registers, so we need to use pseudo instruction for spilling SGPRs. 
1269 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); 1270 1271 // The SGPR spill/restore instructions only work on number sgprs, so we need 1272 // to make sure we are using the correct register class. 1273 if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) { 1274 MachineRegisterInfo &MRI = MF->getRegInfo(); 1275 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 1276 } 1277 1278 BuildMI(MBB, MI, DL, OpDesc) 1279 .addReg(SrcReg, getKillRegState(isKill)) // data 1280 .addFrameIndex(FrameIndex) // addr 1281 .addMemOperand(MMO) 1282 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 1283 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); 1284 // Add the scratch resource registers as implicit uses because we may end up 1285 // needing them, and need to ensure that the reserved registers are 1286 // correctly handled. 1287 if (RI.spillSGPRToVGPR()) 1288 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); 1289 return; 1290 } 1291 1292 unsigned Opcode = RI.hasAGPRs(RC) ? 
getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
  if (RI.hasAGPRs(RC)) {
    // AGPR spills go through a temporary VGPR; add it as an extra def
    // operand on the spill pseudo.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MIB.addReg(Tmp, RegState::Define);
  }
  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
     .addImm(0)                               // offset
     .addMemOperand(MMO);
}

// Map an SGPR reload size in bytes to the matching SI_SPILL_S*_RESTORE
// pseudo. The size must match exactly.
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_S32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_S64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_S96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_S128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_S160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_S256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_S512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_S1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

// Map a VGPR reload size in bytes to the matching SI_SPILL_V*_RESTORE
// pseudo.
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return AMDGPU::SI_SPILL_V32_RESTORE;
  case 8:
    return AMDGPU::SI_SPILL_V64_RESTORE;
  case 12:
    return AMDGPU::SI_SPILL_V96_RESTORE;
  case 16:
    return AMDGPU::SI_SPILL_V128_RESTORE;
  case 20:
    return AMDGPU::SI_SPILL_V160_RESTORE;
  case 32:
    return AMDGPU::SI_SPILL_V256_RESTORE;
  case 64:
    return AMDGPU::SI_SPILL_V512_RESTORE;
  case 128:
    return AMDGPU::SI_SPILL_V1024_RESTORE;
  default:
    llvm_unreachable("unknown register size");
  }
}

// Map an AGPR reload size in bytes to the matching SI_SPILL_A*_RESTORE
// pseudo. Note only a subset of sizes (no 96/160/256-bit) is supported,
// mirroring getAGPRSpillSaveOpcode.
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
  switch (Size) {
  case 4:
    return
AMDGPU::SI_SPILL_A32_RESTORE; 1360 case 8: 1361 return AMDGPU::SI_SPILL_A64_RESTORE; 1362 case 16: 1363 return AMDGPU::SI_SPILL_A128_RESTORE; 1364 case 64: 1365 return AMDGPU::SI_SPILL_A512_RESTORE; 1366 case 128: 1367 return AMDGPU::SI_SPILL_A1024_RESTORE; 1368 default: 1369 llvm_unreachable("unknown register size"); 1370 } 1371 } 1372 1373 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 1374 MachineBasicBlock::iterator MI, 1375 Register DestReg, int FrameIndex, 1376 const TargetRegisterClass *RC, 1377 const TargetRegisterInfo *TRI) const { 1378 MachineFunction *MF = MBB.getParent(); 1379 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1380 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1381 const DebugLoc &DL = MBB.findDebugLoc(MI); 1382 unsigned SpillSize = TRI->getSpillSize(*RC); 1383 1384 MachinePointerInfo PtrInfo 1385 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 1386 1387 MachineMemOperand *MMO = MF->getMachineMemOperand( 1388 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), 1389 FrameInfo.getObjectAlign(FrameIndex)); 1390 1391 if (RI.isSGPRClass(RC)) { 1392 MFI->setHasSpilledSGPRs(); 1393 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); 1394 1395 // FIXME: Maybe this should not include a memoperand because it will be 1396 // lowered to non-memory instructions. 
1397 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 1398 if (DestReg.isVirtual() && SpillSize == 4) { 1399 MachineRegisterInfo &MRI = MF->getRegInfo(); 1400 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 1401 } 1402 1403 if (RI.spillSGPRToVGPR()) 1404 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); 1405 BuildMI(MBB, MI, DL, OpDesc, DestReg) 1406 .addFrameIndex(FrameIndex) // addr 1407 .addMemOperand(MMO) 1408 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 1409 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); 1410 return; 1411 } 1412 1413 unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) 1414 : getVGPRSpillRestoreOpcode(SpillSize); 1415 auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg); 1416 if (RI.hasAGPRs(RC)) { 1417 MachineRegisterInfo &MRI = MF->getRegInfo(); 1418 Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1419 MIB.addReg(Tmp, RegState::Define); 1420 } 1421 MIB.addFrameIndex(FrameIndex) // vaddr 1422 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 1423 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset 1424 .addImm(0) // offset 1425 .addMemOperand(MMO); 1426 } 1427 1428 /// \param @Offset Offset in bytes of the FrameIndex being spilled 1429 unsigned SIInstrInfo::calculateLDSSpillAddress( 1430 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 1431 unsigned FrameOffset, unsigned Size) const { 1432 MachineFunction *MF = MBB.getParent(); 1433 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1434 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 1435 const DebugLoc &DL = MBB.findDebugLoc(MI); 1436 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 1437 unsigned WavefrontSize = ST.getWavefrontSize(); 1438 1439 Register TIDReg = MFI->getTIDReg(); 1440 if (!MFI->hasCalculatedTID()) { 1441 MachineBasicBlock &Entry = MBB.getParent()->front(); 1442 MachineBasicBlock::iterator Insert = 
Entry.front(); 1443 const DebugLoc &DL = Insert->getDebugLoc(); 1444 1445 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 1446 *MF); 1447 if (TIDReg == AMDGPU::NoRegister) 1448 return TIDReg; 1449 1450 if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) && 1451 WorkGroupSize > WavefrontSize) { 1452 Register TIDIGXReg = 1453 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 1454 Register TIDIGYReg = 1455 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 1456 Register TIDIGZReg = 1457 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 1458 Register InputPtrReg = 1459 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1460 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 1461 if (!Entry.isLiveIn(Reg)) 1462 Entry.addLiveIn(Reg); 1463 } 1464 1465 RS->enterBasicBlock(Entry); 1466 // FIXME: Can we scavenge an SReg_64 and access the subregs? 1467 Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); 1468 Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); 1469 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) 1470 .addReg(InputPtrReg) 1471 .addImm(SI::KernelInputOffsets::NGROUPS_Z); 1472 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) 1473 .addReg(InputPtrReg) 1474 .addImm(SI::KernelInputOffsets::NGROUPS_Y); 1475 1476 // NGROUPS.X * NGROUPS.Y 1477 BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) 1478 .addReg(STmp1) 1479 .addReg(STmp0); 1480 // (NGROUPS.X * NGROUPS.Y) * TIDIG.X 1481 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) 1482 .addReg(STmp1) 1483 .addReg(TIDIGXReg); 1484 // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) 1485 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) 1486 .addReg(STmp0) 1487 .addReg(TIDIGYReg) 1488 .addReg(TIDReg); 1489 // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z 1490 getAddNoCarry(Entry, Insert, DL, TIDReg) 1491 
.addReg(TIDReg) 1492 .addReg(TIDIGZReg) 1493 .addImm(0); // clamp bit 1494 } else { 1495 // Get the wave id 1496 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), 1497 TIDReg) 1498 .addImm(-1) 1499 .addImm(0); 1500 1501 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), 1502 TIDReg) 1503 .addImm(-1) 1504 .addReg(TIDReg); 1505 } 1506 1507 BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), 1508 TIDReg) 1509 .addImm(2) 1510 .addReg(TIDReg); 1511 MFI->setTIDReg(TIDReg); 1512 } 1513 1514 // Add FrameIndex to LDS offset 1515 unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); 1516 getAddNoCarry(MBB, MI, DL, TmpReg) 1517 .addImm(LDSOffset) 1518 .addReg(TIDReg) 1519 .addImm(0); // clamp bit 1520 1521 return TmpReg; 1522 } 1523 1524 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, 1525 MachineBasicBlock::iterator MI, 1526 int Count) const { 1527 DebugLoc DL = MBB.findDebugLoc(MI); 1528 while (Count > 0) { 1529 int Arg; 1530 if (Count >= 8) 1531 Arg = 7; 1532 else 1533 Arg = Count - 1; 1534 Count -= 8; 1535 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) 1536 .addImm(Arg); 1537 } 1538 } 1539 1540 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, 1541 MachineBasicBlock::iterator MI) const { 1542 insertWaitStates(MBB, MI, 1); 1543 } 1544 1545 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { 1546 auto MF = MBB.getParent(); 1547 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 1548 1549 assert(Info->isEntryFunction()); 1550 1551 if (MBB.succ_empty()) { 1552 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); 1553 if (HasNoTerminator) { 1554 if (Info->returnsVoid()) { 1555 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); 1556 } else { 1557 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); 1558 } 1559 } 1560 } 1561 } 1562 1563 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { 1564 switch (MI.getOpcode()) { 1565 default: 
return 1; // FIXME: Do wait states equal cycles? 1566 1567 case AMDGPU::S_NOP: 1568 return MI.getOperand(0).getImm() + 1; 1569 } 1570 } 1571 1572 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1573 MachineBasicBlock &MBB = *MI.getParent(); 1574 DebugLoc DL = MBB.findDebugLoc(MI); 1575 switch (MI.getOpcode()) { 1576 default: return TargetInstrInfo::expandPostRAPseudo(MI); 1577 case AMDGPU::S_MOV_B64_term: 1578 // This is only a terminator to get the correct spill code placement during 1579 // register allocation. 1580 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1581 break; 1582 1583 case AMDGPU::S_MOV_B32_term: 1584 // This is only a terminator to get the correct spill code placement during 1585 // register allocation. 1586 MI.setDesc(get(AMDGPU::S_MOV_B32)); 1587 break; 1588 1589 case AMDGPU::S_XOR_B64_term: 1590 // This is only a terminator to get the correct spill code placement during 1591 // register allocation. 1592 MI.setDesc(get(AMDGPU::S_XOR_B64)); 1593 break; 1594 1595 case AMDGPU::S_XOR_B32_term: 1596 // This is only a terminator to get the correct spill code placement during 1597 // register allocation. 1598 MI.setDesc(get(AMDGPU::S_XOR_B32)); 1599 break; 1600 1601 case AMDGPU::S_OR_B32_term: 1602 // This is only a terminator to get the correct spill code placement during 1603 // register allocation. 1604 MI.setDesc(get(AMDGPU::S_OR_B32)); 1605 break; 1606 1607 case AMDGPU::S_ANDN2_B64_term: 1608 // This is only a terminator to get the correct spill code placement during 1609 // register allocation. 1610 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 1611 break; 1612 1613 case AMDGPU::S_ANDN2_B32_term: 1614 // This is only a terminator to get the correct spill code placement during 1615 // register allocation. 
1616 MI.setDesc(get(AMDGPU::S_ANDN2_B32)); 1617 break; 1618 1619 case AMDGPU::V_MOV_B64_PSEUDO: { 1620 Register Dst = MI.getOperand(0).getReg(); 1621 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1622 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1623 1624 const MachineOperand &SrcOp = MI.getOperand(1); 1625 // FIXME: Will this work for 64-bit floating point immediates? 1626 assert(!SrcOp.isFPImm()); 1627 if (SrcOp.isImm()) { 1628 APInt Imm(64, SrcOp.getImm()); 1629 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1630 .addImm(Imm.getLoBits(32).getZExtValue()) 1631 .addReg(Dst, RegState::Implicit | RegState::Define); 1632 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1633 .addImm(Imm.getHiBits(32).getZExtValue()) 1634 .addReg(Dst, RegState::Implicit | RegState::Define); 1635 } else { 1636 assert(SrcOp.isReg()); 1637 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1638 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 1639 .addReg(Dst, RegState::Implicit | RegState::Define); 1640 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1641 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 1642 .addReg(Dst, RegState::Implicit | RegState::Define); 1643 } 1644 MI.eraseFromParent(); 1645 break; 1646 } 1647 case AMDGPU::V_MOV_B64_DPP_PSEUDO: { 1648 expandMovDPP64(MI); 1649 break; 1650 } 1651 case AMDGPU::V_SET_INACTIVE_B32: { 1652 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1653 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1654 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1655 .addReg(Exec); 1656 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 1657 .add(MI.getOperand(2)); 1658 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1659 .addReg(Exec); 1660 MI.eraseFromParent(); 1661 break; 1662 } 1663 case AMDGPU::V_SET_INACTIVE_B64: { 1664 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1665 unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 1666 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1667 .addReg(Exec); 1668 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 1669 MI.getOperand(0).getReg()) 1670 .add(MI.getOperand(2)); 1671 expandPostRAPseudo(*Copy); 1672 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1673 .addReg(Exec); 1674 MI.eraseFromParent(); 1675 break; 1676 } 1677 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1: 1678 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2: 1679 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3: 1680 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4: 1681 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5: 1682 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8: 1683 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16: 1684 case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: 1685 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1: 1686 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2: 1687 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3: 1688 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4: 1689 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5: 1690 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8: 1691 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16: 1692 case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32: 1693 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1: 1694 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2: 1695 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4: 1696 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8: 1697 case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: { 1698 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); 1699 1700 unsigned Opc; 1701 if (RI.hasVGPRs(EltRC)) { 1702 Opc = ST.useVGPRIndexMode() ? 1703 AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32; 1704 } else { 1705 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? 
1706 AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32; 1707 } 1708 1709 const MCInstrDesc &OpDesc = get(Opc); 1710 Register VecReg = MI.getOperand(0).getReg(); 1711 bool IsUndef = MI.getOperand(1).isUndef(); 1712 unsigned SubReg = MI.getOperand(3).getImm(); 1713 assert(VecReg == MI.getOperand(1).getReg()); 1714 1715 MachineInstrBuilder MIB = 1716 BuildMI(MBB, MI, DL, OpDesc) 1717 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1718 .add(MI.getOperand(2)) 1719 .addReg(VecReg, RegState::ImplicitDefine) 1720 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 1721 1722 const int ImpDefIdx = 1723 OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); 1724 const int ImpUseIdx = ImpDefIdx + 1; 1725 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 1726 MI.eraseFromParent(); 1727 break; 1728 } 1729 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 1730 MachineFunction &MF = *MBB.getParent(); 1731 Register Reg = MI.getOperand(0).getReg(); 1732 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 1733 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 1734 1735 // Create a bundle so these instructions won't be re-ordered by the 1736 // post-RA scheduler. 1737 MIBundleBuilder Bundler(MBB, MI); 1738 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 1739 1740 // Add 32-bit offset from this instruction to the start of the 1741 // constant data. 1742 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 1743 .addReg(RegLo) 1744 .add(MI.getOperand(1))); 1745 1746 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 1747 .addReg(RegHi); 1748 MIB.add(MI.getOperand(2)); 1749 1750 Bundler.append(MIB); 1751 finalizeBundle(MBB, Bundler.begin()); 1752 1753 MI.eraseFromParent(); 1754 break; 1755 } 1756 case AMDGPU::ENTER_WWM: { 1757 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when 1758 // WWM is entered. 1759 MI.setDesc(get(ST.isWave32() ? 
                                     AMDGPU::S_OR_SAVEEXEC_B32
                                     : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::EXIT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  }
  return true;
}

// Expand V_MOV_B64_DPP_PSEUDO into two 32-bit V_MOV_B32_dpp instructions, one
// per half (sub0/sub1). For a physical destination the subregisters are
// written directly; for a virtual (SSA) destination two temporaries are
// created and rejoined with a REG_SEQUENCE. Returns the (lo, hi) pair of new
// instructions; the pseudo itself is erased.
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register Dst = MI.getOperand(0).getReg();
  unsigned Part = 0;
  MachineInstr *Split[2];

  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
    if (Dst.isPhysical()) {
      MovDPP.addDef(RI.getSubReg(Dst, Sub));
    } else {
      assert(MRI.isSSA());
      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      MovDPP.addDef(Tmp);
    }

    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
      const MachineOperand &SrcOp = MI.getOperand(I);
      assert(!SrcOp.isFPImm());
      if (SrcOp.isImm()) {
        // Extract the 32-bit half of the immediate corresponding to this part.
        APInt Imm(64, SrcOp.getImm());
        Imm.ashrInPlace(Part * 32);
        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
      } else {
        assert(SrcOp.isReg());
        Register Src = SrcOp.getReg();
        if (Src.isPhysical())
          MovDPP.addReg(RI.getSubReg(Src, Sub));
        else
          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
      }
    }

    // Copy the remaining immediate (DPP control) operands unchanged.
    for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
      MovDPP.addImm(MI.getOperand(I).getImm());

    Split[Part] = MovDPP;
    ++Part;
  }

  if (Dst.isVirtual())
    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(Split[0]->getOperand(0).getReg())
      .addImm(AMDGPU::sub0)
      .addReg(Split[1]->getOperand(0).getReg())
      .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return std::make_pair(Split[0], Split[1]);
}

// Swap the src0/src1 modifier immediates (neg/abs etc.) when commuting.
// Returns false if the instruction has no src0 modifiers operand, in which
// case there is nothing to swap.
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                      MachineOperand &Src0,
                                      unsigned Src0OpName,
                                      MachineOperand &Src1,
                                      unsigned Src1OpName) const {
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
  if (!Src0Mods)
    return false;

  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
  assert(Src1Mods &&
         "All commutable instructions have both src0 and src1 modifiers");

  int Src0ModsVal = Src0Mods->getImm();
  int Src1ModsVal = Src1Mods->getImm();

  Src1Mods->setImm(Src0ModsVal);
  Src0Mods->setImm(Src1ModsVal);
  return true;
}

// Exchange a register operand with an immediate or frame-index operand in
// place, preserving the register operand's flags and subregister. Returns &MI
// on success, or nullptr if the non-register operand kind is unhandled.
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
                                             MachineOperand &RegOp,
                                             MachineOperand &NonRegOp) {
  // Capture the register operand's state before it is overwritten below.
  Register Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else
    return nullptr;

  // Rebuild the register in the other operand slot with its original flags.
  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}

// Commute src0/src1 of \p MI. Handles reg<->reg via the base implementation
// and reg<->imm/FI via swapRegAndNonRegOperand; also swaps the source
// modifiers and installs the commuted opcode. Returns nullptr on failure.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}

// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}

// Report the src0/src1 indices for a commutable instruction, accounting for
// the modifier operands that may precede them.
bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}

// Return true if a branch over \p BrOffset bytes is encodable in the SIMM16
// branch immediate (range restrictable via -amdgpu-s-branch-bits for tests).
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                        int64_t BrOffset) const {
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
  // block is unanalyzable.
  assert(BranchOp != AMDGPU::S_SETPC_B64);

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  return isIntN(BranchOffsetBits, BrOffset);
}

MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
  const MachineInstr &MI) const {
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but can always be legal so
    // there's no need to analyze it.
    return nullptr;
  }

  return MI.getOperand(0).getMBB();
}

// Materialize a long (out of SIMM16 range) unconditional branch in the empty
// block \p MBB: s_getpc_b64 followed by 64-bit PC arithmetic and
// s_setpc_b64. Returns the byte size of the inserted sequence.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  if (BrOffset >= 0) {
    // Forward branch: PC + (DestBB - here), 64-bit add via add/addc pair.
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  // Replace the virtual PC register with a scavenged physical SGPR pair.
  RS->enterBasicBlockEnd(MBB);
  unsigned Scav = RS->scavengeRegisterBackwards(
    AMDGPU::SReg_64RegClass,
    MachineBasicBlock::iterator(GetPC), false, 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Size in bytes: s_getpc (4) + add/addc or sub/subb pair (8) + s_setpc (4),
  // plus 4 — presumably padding/alignment slack; TODO confirm intent.
  return 4 + 8 + 4 + 4;
}

unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
  switch (Cond) {
  case SIInstrInfo::SCC_TRUE:
    return AMDGPU::S_CBRANCH_SCC1;
  case SIInstrInfo::SCC_FALSE:
    return AMDGPU::S_CBRANCH_SCC0;
  case SIInstrInfo::VCCNZ:
    return AMDGPU::S_CBRANCH_VCCNZ;
  case SIInstrInfo::VCCZ:
    return AMDGPU::S_CBRANCH_VCCZ;
  case SIInstrInfo::EXECNZ:
    return AMDGPU::S_CBRANCH_EXECNZ;
  case SIInstrInfo::EXECZ:
    return AMDGPU::S_CBRANCH_EXECZ;
  default:
    llvm_unreachable("invalid branch predicate");
  }
}

SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_CBRANCH_SCC0:
    return SCC_FALSE;
  case AMDGPU::S_CBRANCH_SCC1:
    return SCC_TRUE;
  case AMDGPU::S_CBRANCH_VCCNZ:
    return VCCNZ;
  case AMDGPU::S_CBRANCH_VCCZ:
    return VCCZ;
  case AMDGPU::S_CBRANCH_EXECNZ:
    return EXECNZ;
  case AMDGPU::S_CBRANCH_EXECZ:
    return EXECZ;
  default:
    return INVALID_BR;
  }
}

// Analyze the branch(es) starting at iterator \p I. Follows the standard
// analyzeBranch contract: returns false on success with TBB/FBB/Cond filled
// in, true if the terminators are not understood.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    // Condition is a register (operand 0); destination is operand 1.
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Conditional branch followed by an unconditional branch.
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}

bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  auto E = MBB.end();
  if (I == E)
    return false;

  // Skip over the instructions that are artificially terminators for special
  // exec management.
  while (I != E && !I->isBranch() && !I->isReturn() &&
         I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
    switch (I->getOpcode()) {
    case AMDGPU::SI_MASK_BRANCH:
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  if (I == E)
    return false;

  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);

  ++I;

  // TODO: Should be able to treat as fallthrough?
  if (I == MBB.end())
    return true;

  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
    return true;

  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();

  // Specifically handle the case where the conditional branch is to the same
  // destination as the mask branch. e.g.
  //
  // si_mask_branch BB8
  // s_cbranch_execz BB8
  // s_cbranch BB9
  //
  // This is required to understand divergent loops which may need the branches
  // to be relaxed.
  if (TBB != MaskBrDest || Cond.empty())
    return true;

  auto Pred = Cond[0].getImm();
  return (Pred != EXECZ && Pred != EXECNZ);
}

// Remove all terminators except SI_MASK_BRANCH; reports the removed byte size
// through \p BytesRemoved for branch relaxation.
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                   int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();

  unsigned Count = 0;
  unsigned RemovedSize = 0;
  while (I != MBB.end()) {
    MachineBasicBlock::iterator Next = std::next(I);
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
      I = Next;
      continue;
    }

    RemovedSize += getInstSizeInBytes(*I);
    I->eraseFromParent();
    ++Count;
    I = Next;
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}

// Copy the flags onto the implicit condition register operand.
2249 static void preserveCondRegFlags(MachineOperand &CondReg, 2250 const MachineOperand &OrigCond) { 2251 CondReg.setIsUndef(OrigCond.isUndef()); 2252 CondReg.setIsKill(OrigCond.isKill()); 2253 } 2254 2255 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 2256 MachineBasicBlock *TBB, 2257 MachineBasicBlock *FBB, 2258 ArrayRef<MachineOperand> Cond, 2259 const DebugLoc &DL, 2260 int *BytesAdded) const { 2261 if (!FBB && Cond.empty()) { 2262 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2263 .addMBB(TBB); 2264 if (BytesAdded) 2265 *BytesAdded = 4; 2266 return 1; 2267 } 2268 2269 if(Cond.size() == 1 && Cond[0].isReg()) { 2270 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 2271 .add(Cond[0]) 2272 .addMBB(TBB); 2273 return 1; 2274 } 2275 2276 assert(TBB && Cond[0].isImm()); 2277 2278 unsigned Opcode 2279 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 2280 2281 if (!FBB) { 2282 Cond[1].isUndef(); 2283 MachineInstr *CondBr = 2284 BuildMI(&MBB, DL, get(Opcode)) 2285 .addMBB(TBB); 2286 2287 // Copy the flags onto the implicit condition register operand. 
2288 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 2289 2290 if (BytesAdded) 2291 *BytesAdded = 4; 2292 return 1; 2293 } 2294 2295 assert(TBB && FBB); 2296 2297 MachineInstr *CondBr = 2298 BuildMI(&MBB, DL, get(Opcode)) 2299 .addMBB(TBB); 2300 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2301 .addMBB(FBB); 2302 2303 MachineOperand &CondReg = CondBr->getOperand(1); 2304 CondReg.setIsUndef(Cond[1].isUndef()); 2305 CondReg.setIsKill(Cond[1].isKill()); 2306 2307 if (BytesAdded) 2308 *BytesAdded = 8; 2309 2310 return 2; 2311 } 2312 2313 bool SIInstrInfo::reverseBranchCondition( 2314 SmallVectorImpl<MachineOperand> &Cond) const { 2315 if (Cond.size() != 2) { 2316 return true; 2317 } 2318 2319 if (Cond[0].isImm()) { 2320 Cond[0].setImm(-Cond[0].getImm()); 2321 return false; 2322 } 2323 2324 return true; 2325 } 2326 2327 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2328 ArrayRef<MachineOperand> Cond, 2329 Register DstReg, Register TrueReg, 2330 Register FalseReg, int &CondCycles, 2331 int &TrueCycles, int &FalseCycles) const { 2332 switch (Cond[0].getImm()) { 2333 case VCCNZ: 2334 case VCCZ: { 2335 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2336 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2337 assert(MRI.getRegClass(FalseReg) == RC); 2338 2339 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2340 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2341 2342 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2343 return RI.hasVGPRs(RC) && NumInsts <= 6; 2344 } 2345 case SCC_TRUE: 2346 case SCC_FALSE: { 2347 // FIXME: We could insert for VGPRs if we could replace the original compare 2348 // with a vector one. 
2349 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2350 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2351 assert(MRI.getRegClass(FalseReg) == RC); 2352 2353 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2354 2355 // Multiples of 8 can do s_cselect_b64 2356 if (NumInsts % 2 == 0) 2357 NumInsts /= 2; 2358 2359 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2360 return RI.isSGPRClass(RC); 2361 } 2362 default: 2363 return false; 2364 } 2365 } 2366 2367 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2368 MachineBasicBlock::iterator I, const DebugLoc &DL, 2369 Register DstReg, ArrayRef<MachineOperand> Cond, 2370 Register TrueReg, Register FalseReg) const { 2371 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2372 if (Pred == VCCZ || Pred == SCC_FALSE) { 2373 Pred = static_cast<BranchPredicate>(-Pred); 2374 std::swap(TrueReg, FalseReg); 2375 } 2376 2377 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2378 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2379 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2380 2381 if (DstSize == 32) { 2382 MachineInstr *Select; 2383 if (Pred == SCC_TRUE) { 2384 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 2385 .addReg(TrueReg) 2386 .addReg(FalseReg); 2387 } else { 2388 // Instruction's operands are backwards from what is expected. 
2389 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 2390 .addReg(FalseReg) 2391 .addReg(TrueReg); 2392 } 2393 2394 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2395 return; 2396 } 2397 2398 if (DstSize == 64 && Pred == SCC_TRUE) { 2399 MachineInstr *Select = 2400 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2401 .addReg(TrueReg) 2402 .addReg(FalseReg); 2403 2404 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2405 return; 2406 } 2407 2408 static const int16_t Sub0_15[] = { 2409 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2410 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2411 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2412 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2413 }; 2414 2415 static const int16_t Sub0_15_64[] = { 2416 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2417 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2418 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2419 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2420 }; 2421 2422 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2423 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2424 const int16_t *SubIndices = Sub0_15; 2425 int NElts = DstSize / 32; 2426 2427 // 64-bit select is only available for SALU. 2428 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 
2429 if (Pred == SCC_TRUE) { 2430 if (NElts % 2) { 2431 SelOp = AMDGPU::S_CSELECT_B32; 2432 EltRC = &AMDGPU::SGPR_32RegClass; 2433 } else { 2434 SelOp = AMDGPU::S_CSELECT_B64; 2435 EltRC = &AMDGPU::SGPR_64RegClass; 2436 SubIndices = Sub0_15_64; 2437 NElts /= 2; 2438 } 2439 } 2440 2441 MachineInstrBuilder MIB = BuildMI( 2442 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 2443 2444 I = MIB->getIterator(); 2445 2446 SmallVector<Register, 8> Regs; 2447 for (int Idx = 0; Idx != NElts; ++Idx) { 2448 Register DstElt = MRI.createVirtualRegister(EltRC); 2449 Regs.push_back(DstElt); 2450 2451 unsigned SubIdx = SubIndices[Idx]; 2452 2453 MachineInstr *Select; 2454 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { 2455 Select = 2456 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2457 .addReg(FalseReg, 0, SubIdx) 2458 .addReg(TrueReg, 0, SubIdx); 2459 } else { 2460 Select = 2461 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2462 .addReg(TrueReg, 0, SubIdx) 2463 .addReg(FalseReg, 0, SubIdx); 2464 } 2465 2466 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2467 fixImplicitOperands(*Select); 2468 2469 MIB.addReg(DstElt) 2470 .addImm(SubIdx); 2471 } 2472 } 2473 2474 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 2475 switch (MI.getOpcode()) { 2476 case AMDGPU::V_MOV_B32_e32: 2477 case AMDGPU::V_MOV_B32_e64: 2478 case AMDGPU::V_MOV_B64_PSEUDO: { 2479 // If there are additional implicit register operands, this may be used for 2480 // register indexing so the source register operand isn't simply copied. 
2481 unsigned NumOps = MI.getDesc().getNumOperands() + 2482 MI.getDesc().getNumImplicitUses(); 2483 2484 return MI.getNumOperands() == NumOps; 2485 } 2486 case AMDGPU::S_MOV_B32: 2487 case AMDGPU::S_MOV_B64: 2488 case AMDGPU::COPY: 2489 case AMDGPU::V_ACCVGPR_WRITE_B32: 2490 case AMDGPU::V_ACCVGPR_READ_B32: 2491 return true; 2492 default: 2493 return false; 2494 } 2495 } 2496 2497 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( 2498 unsigned Kind) const { 2499 switch(Kind) { 2500 case PseudoSourceValue::Stack: 2501 case PseudoSourceValue::FixedStack: 2502 return AMDGPUAS::PRIVATE_ADDRESS; 2503 case PseudoSourceValue::ConstantPool: 2504 case PseudoSourceValue::GOT: 2505 case PseudoSourceValue::JumpTable: 2506 case PseudoSourceValue::GlobalValueCallEntry: 2507 case PseudoSourceValue::ExternalSymbolCallEntry: 2508 case PseudoSourceValue::TargetCustom: 2509 return AMDGPUAS::CONSTANT_ADDRESS; 2510 } 2511 return AMDGPUAS::FLAT_ADDRESS; 2512 } 2513 2514 static void removeModOperands(MachineInstr &MI) { 2515 unsigned Opc = MI.getOpcode(); 2516 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2517 AMDGPU::OpName::src0_modifiers); 2518 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2519 AMDGPU::OpName::src1_modifiers); 2520 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 2521 AMDGPU::OpName::src2_modifiers); 2522 2523 MI.RemoveOperand(Src2ModIdx); 2524 MI.RemoveOperand(Src1ModIdx); 2525 MI.RemoveOperand(Src0ModIdx); 2526 } 2527 2528 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 2529 Register Reg, MachineRegisterInfo *MRI) const { 2530 if (!MRI->hasOneNonDBGUse(Reg)) 2531 return false; 2532 2533 switch (DefMI.getOpcode()) { 2534 default: 2535 return false; 2536 case AMDGPU::S_MOV_B64: 2537 // TODO: We could fold 64-bit immediates, but this get compilicated 2538 // when there are sub-registers. 
2539 return false; 2540 2541 case AMDGPU::V_MOV_B32_e32: 2542 case AMDGPU::S_MOV_B32: 2543 case AMDGPU::V_ACCVGPR_WRITE_B32: 2544 break; 2545 } 2546 2547 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 2548 assert(ImmOp); 2549 // FIXME: We could handle FrameIndex values here. 2550 if (!ImmOp->isImm()) 2551 return false; 2552 2553 unsigned Opc = UseMI.getOpcode(); 2554 if (Opc == AMDGPU::COPY) { 2555 Register DstReg = UseMI.getOperand(0).getReg(); 2556 bool Is16Bit = getOpSize(UseMI, 0) == 2; 2557 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); 2558 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2559 APInt Imm(32, ImmOp->getImm()); 2560 2561 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) 2562 Imm = Imm.ashr(16); 2563 2564 if (RI.isAGPR(*MRI, DstReg)) { 2565 if (!isInlineConstant(Imm)) 2566 return false; 2567 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; 2568 } 2569 2570 if (Is16Bit) { 2571 if (isVGPRCopy) 2572 return false; // Do not clobber vgpr_hi16 2573 2574 if (DstReg.isVirtual() && 2575 UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) 2576 return false; 2577 2578 UseMI.getOperand(0).setSubReg(0); 2579 if (DstReg.isPhysical()) { 2580 DstReg = RI.get32BitRegister(DstReg); 2581 UseMI.getOperand(0).setReg(DstReg); 2582 } 2583 assert(UseMI.getOperand(1).getReg().isVirtual()); 2584 } 2585 2586 UseMI.setDesc(get(NewOpc)); 2587 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); 2588 UseMI.getOperand(1).setTargetFlags(0); 2589 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2590 return true; 2591 } 2592 2593 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2594 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || 2595 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2596 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { 2597 // Don't fold if we are using source or output modifiers. The new VOP2 2598 // instructions don't have them. 
2599 if (hasAnyModifiersSet(UseMI)) 2600 return false; 2601 2602 // If this is a free constant, there's no reason to do this. 2603 // TODO: We could fold this here instead of letting SIFoldOperands do it 2604 // later. 2605 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2606 2607 // Any src operand can be used for the legality check. 2608 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2609 return false; 2610 2611 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2612 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; 2613 bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2614 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; 2615 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2616 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2617 2618 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 2619 // We should only expect these to be on src0 due to canonicalizations. 2620 if (Src0->isReg() && Src0->getReg() == Reg) { 2621 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2622 return false; 2623 2624 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2625 return false; 2626 2627 unsigned NewOpc = 2628 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) 2629 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 2630 if (pseudoToMCOpcode(NewOpc) == -1) 2631 return false; 2632 2633 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 2634 2635 const int64_t Imm = ImmOp->getImm(); 2636 2637 // FIXME: This would be a lot easier if we could return a new instruction 2638 // instead of having to modify in place. 2639 2640 // Remove these first since they are at the end. 
2641 UseMI.RemoveOperand( 2642 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2643 UseMI.RemoveOperand( 2644 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2645 2646 Register Src1Reg = Src1->getReg(); 2647 unsigned Src1SubReg = Src1->getSubReg(); 2648 Src0->setReg(Src1Reg); 2649 Src0->setSubReg(Src1SubReg); 2650 Src0->setIsKill(Src1->isKill()); 2651 2652 if (Opc == AMDGPU::V_MAC_F32_e64 || 2653 Opc == AMDGPU::V_MAC_F16_e64 || 2654 Opc == AMDGPU::V_FMAC_F32_e64 || 2655 Opc == AMDGPU::V_FMAC_F16_e64) 2656 UseMI.untieRegOperand( 2657 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2658 2659 Src1->ChangeToImmediate(Imm); 2660 2661 removeModOperands(UseMI); 2662 UseMI.setDesc(get(NewOpc)); 2663 2664 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2665 if (DeleteDef) 2666 DefMI.eraseFromParent(); 2667 2668 return true; 2669 } 2670 2671 // Added part is the constant: Use v_madak_{f16, f32}. 2672 if (Src2->isReg() && Src2->getReg() == Reg) { 2673 // Not allowed to use constant bus for another operand. 2674 // We can however allow an inline immediate as src0. 2675 bool Src0Inlined = false; 2676 if (Src0->isReg()) { 2677 // Try to inline constant if possible. 2678 // If the Def moves immediate and the use is single 2679 // We are saving VGPR here. 
2680 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2681 if (Def && Def->isMoveImmediate() && 2682 isInlineConstant(Def->getOperand(1)) && 2683 MRI->hasOneUse(Src0->getReg())) { 2684 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2685 Src0Inlined = true; 2686 } else if ((Register::isPhysicalRegister(Src0->getReg()) && 2687 (ST.getConstantBusLimit(Opc) <= 1 && 2688 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || 2689 (Register::isVirtualRegister(Src0->getReg()) && 2690 (ST.getConstantBusLimit(Opc) <= 1 && 2691 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 2692 return false; 2693 // VGPR is okay as Src0 - fallthrough 2694 } 2695 2696 if (Src1->isReg() && !Src0Inlined ) { 2697 // We have one slot for inlinable constant so far - try to fill it 2698 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2699 if (Def && Def->isMoveImmediate() && 2700 isInlineConstant(Def->getOperand(1)) && 2701 MRI->hasOneUse(Src1->getReg()) && 2702 commuteInstruction(UseMI)) { 2703 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2704 } else if ((Register::isPhysicalRegister(Src1->getReg()) && 2705 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2706 (Register::isVirtualRegister(Src1->getReg()) && 2707 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2708 return false; 2709 // VGPR is okay as Src1 - fallthrough 2710 } 2711 2712 unsigned NewOpc = 2713 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) 2714 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 2715 if (pseudoToMCOpcode(NewOpc) == -1) 2716 return false; 2717 2718 const int64_t Imm = ImmOp->getImm(); 2719 2720 // FIXME: This would be a lot easier if we could return a new instruction 2721 // instead of having to modify in place. 2722 2723 // Remove these first since they are at the end. 
2724 UseMI.RemoveOperand( 2725 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2726 UseMI.RemoveOperand( 2727 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2728 2729 if (Opc == AMDGPU::V_MAC_F32_e64 || 2730 Opc == AMDGPU::V_MAC_F16_e64 || 2731 Opc == AMDGPU::V_FMAC_F32_e64 || 2732 Opc == AMDGPU::V_FMAC_F16_e64) 2733 UseMI.untieRegOperand( 2734 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2735 2736 // ChangingToImmediate adds Src2 back to the instruction. 2737 Src2->ChangeToImmediate(Imm); 2738 2739 // These come before src2. 2740 removeModOperands(UseMI); 2741 UseMI.setDesc(get(NewOpc)); 2742 // It might happen that UseMI was commuted 2743 // and we now have SGPR as SRC1. If so 2 inlined 2744 // constant and SGPR are illegal. 2745 legalizeOperands(UseMI); 2746 2747 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2748 if (DeleteDef) 2749 DefMI.eraseFromParent(); 2750 2751 return true; 2752 } 2753 } 2754 2755 return false; 2756 } 2757 2758 static bool 2759 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, 2760 ArrayRef<const MachineOperand *> BaseOps2) { 2761 if (BaseOps1.size() != BaseOps2.size()) 2762 return false; 2763 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { 2764 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) 2765 return false; 2766 } 2767 return true; 2768 } 2769 2770 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 2771 int WidthB, int OffsetB) { 2772 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 2773 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 2774 int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 2775 return LowOffset + LowWidth <= HighOffset; 2776 } 2777 2778 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 2779 const MachineInstr &MIb) const { 2780 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; 2781 int64_t Offset0, Offset1; 2782 unsigned Dummy0, Dummy1; 2783 bool Offset0IsScalable, Offset1IsScalable; 2784 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, 2785 Dummy0, &RI) || 2786 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, 2787 Dummy1, &RI)) 2788 return false; 2789 2790 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) 2791 return false; 2792 2793 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 2794 // FIXME: Handle ds_read2 / ds_write2. 2795 return false; 2796 } 2797 unsigned Width0 = MIa.memoperands().front()->getSize(); 2798 unsigned Width1 = MIb.memoperands().front()->getSize(); 2799 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); 2800 } 2801 2802 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 2803 const MachineInstr &MIb) const { 2804 assert(MIa.mayLoadOrStore() && 2805 "MIa must load from or modify a memory location"); 2806 assert(MIb.mayLoadOrStore() && 2807 "MIb must load from or modify a memory location"); 2808 2809 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 2810 return false; 2811 2812 // XXX - Can we relax this between address spaces? 2813 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 2814 return false; 2815 2816 // TODO: Should we check the address space from the MachineMemOperand? That 2817 // would allow us to distinguish objects we know don't alias based on the 2818 // underlying address space, even if it was lowered to a different one, 2819 // e.g. private accesses lowered to use MUBUF instructions on a scratch 2820 // buffer. 
2821 if (isDS(MIa)) { 2822 if (isDS(MIb)) 2823 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2824 2825 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2826 } 2827 2828 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2829 if (isMUBUF(MIb) || isMTBUF(MIb)) 2830 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2831 2832 return !isFLAT(MIb) && !isSMRD(MIb); 2833 } 2834 2835 if (isSMRD(MIa)) { 2836 if (isSMRD(MIb)) 2837 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2838 2839 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 2840 } 2841 2842 if (isFLAT(MIa)) { 2843 if (isFLAT(MIb)) 2844 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2845 2846 return false; 2847 } 2848 2849 return false; 2850 } 2851 2852 static int64_t getFoldableImm(const MachineOperand* MO) { 2853 if (!MO->isReg()) 2854 return false; 2855 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2856 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2857 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2858 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2859 Def->getOperand(1).isImm()) 2860 return Def->getOperand(1).getImm(); 2861 return AMDGPU::NoRegister; 2862 } 2863 2864 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2865 MachineInstr &MI, 2866 LiveVariables *LV) const { 2867 unsigned Opc = MI.getOpcode(); 2868 bool IsF16 = false; 2869 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 2870 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; 2871 2872 switch (Opc) { 2873 default: 2874 return nullptr; 2875 case AMDGPU::V_MAC_F16_e64: 2876 case AMDGPU::V_FMAC_F16_e64: 2877 IsF16 = true; 2878 LLVM_FALLTHROUGH; 2879 case AMDGPU::V_MAC_F32_e64: 2880 case AMDGPU::V_FMAC_F32_e64: 2881 break; 2882 case AMDGPU::V_MAC_F16_e32: 2883 case AMDGPU::V_FMAC_F16_e32: 2884 IsF16 = true; 2885 LLVM_FALLTHROUGH; 2886 case AMDGPU::V_MAC_F32_e32: 2887 case AMDGPU::V_FMAC_F32_e32: { 2888 int Src0Idx = 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2889 AMDGPU::OpName::src0); 2890 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2891 if (!Src0->isReg() && !Src0->isImm()) 2892 return nullptr; 2893 2894 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2895 return nullptr; 2896 2897 break; 2898 } 2899 } 2900 2901 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2902 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2903 const MachineOperand *Src0Mods = 2904 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2905 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2906 const MachineOperand *Src1Mods = 2907 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2908 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2909 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2910 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2911 2912 if (!Src0Mods && !Src1Mods && !Clamp && !Omod && 2913 // If we have an SGPR input, we will violate the constant bus restriction. 2914 (ST.getConstantBusLimit(Opc) > 1 || 2915 !Src0->isReg() || 2916 !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 2917 if (auto Imm = getFoldableImm(Src2)) { 2918 unsigned NewOpc = 2919 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) 2920 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 2921 if (pseudoToMCOpcode(NewOpc) != -1) 2922 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2923 .add(*Dst) 2924 .add(*Src0) 2925 .add(*Src1) 2926 .addImm(Imm); 2927 } 2928 unsigned NewOpc = 2929 IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) 2930 : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 2931 if (auto Imm = getFoldableImm(Src1)) { 2932 if (pseudoToMCOpcode(NewOpc) != -1) 2933 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2934 .add(*Dst) 2935 .add(*Src0) 2936 .addImm(Imm) 2937 .add(*Src2); 2938 } 2939 if (auto Imm = getFoldableImm(Src0)) { 2940 if (pseudoToMCOpcode(NewOpc) != -1 && 2941 isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, 2942 AMDGPU::OpName::src0), Src1)) 2943 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2944 .add(*Dst) 2945 .add(*Src1) 2946 .addImm(Imm) 2947 .add(*Src2); 2948 } 2949 } 2950 2951 unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) 2952 : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); 2953 if (pseudoToMCOpcode(NewOpc) == -1) 2954 return nullptr; 2955 2956 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2957 .add(*Dst) 2958 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2959 .add(*Src0) 2960 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2961 .add(*Src1) 2962 .addImm(0) // Src mods 2963 .add(*Src2) 2964 .addImm(Clamp ? Clamp->getImm() : 0) 2965 .addImm(Omod ? Omod->getImm() : 0); 2966 } 2967 2968 // It's not generally safe to move VALU instructions across these since it will 2969 // start using the register as a base index rather than directly. 2970 // XXX - Why isn't hasSideEffects sufficient for these? 2971 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 2972 switch (MI.getOpcode()) { 2973 case AMDGPU::S_SET_GPR_IDX_ON: 2974 case AMDGPU::S_SET_GPR_IDX_MODE: 2975 case AMDGPU::S_SET_GPR_IDX_OFF: 2976 return true; 2977 default: 2978 return false; 2979 } 2980 } 2981 2982 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 2983 const MachineBasicBlock *MBB, 2984 const MachineFunction &MF) const { 2985 // Skipping the check for SP writes in the base implementation. The reason it 2986 // was added was apparently due to compile time concerns. 2987 // 2988 // TODO: Do we really want this barrier? 
  // It triggers unnecessary hazard nops
  // but is probably avoidable.

  // Copied from base implementation.
  // Terminators and labels can't be scheduled around.
  if (MI.isTerminator() || MI.isPosition())
    return true;

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.

  // TODO: Don't treat setreg with known constant that only changes MODE as
  // barrier.
  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
         changesVGPRIndexingMode(MI);
}

// DS opcodes that are hardwired to operate on GDS (ordered count and the GWS
// operations).
bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V ||
         Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

// Returns true if \p MI has an implicit def of the MODE register.
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
  // Skip the full operand and register alias search modifiesRegister
  // does. There's only a handful of instructions that touch this, it's only an
  // implicit def, and doesn't alias any other registers.
  if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
    for (; ImpDef && *ImpDef; ++ImpDef) {
      if (*ImpDef == AMDGPU::MODE)
        return true;
    }
  }

  return false;
}

// Conservatively report instructions that must not execute while EXEC = 0:
// scalar memory stores, returns, shader I/O and traps, calls/inline asm,
// mode changes, and lane-access reads (see the individual checks below).
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();

  if (MI.mayStore() && isSMRD(MI))
    return true; // scalar store or atomic

  // This will terminate the function when other lanes may need to continue.
  if (MI.isReturn())
    return true;

  // These instructions cause shader I/O that may cause hardware lockups
  // when executed with an empty EXEC mask.
  //
  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
  // EXEC = 0, but checking for that case here seems not worth it
  // given the typical code patterns.
  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
      Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
      Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
      Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
    return true;

  if (MI.isCall() || MI.isInlineAsm())
    return true; // conservative assumption

  // A mode change is a scalar operation that influences vector instructions.
  if (modifiesModeRegister(MI))
    return true;

  // These are like SALU instructions in terms of effects, so it's questionable
  // whether we should return true for those.
  //
  // However, executing them with EXEC = 0 causes them to operate on undefined
  // data, which we avoid by returning true here.
  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
    return true;

  return false;
}

// Returns true if \p MI may read the EXEC mask. Conservatively true for
// calls, generic (pre-ISel) opcodes, and any non-SALU target instruction.
bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
                              const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return false;

  // This won't read exec if this is an SGPR->SGPR copy.
  if (MI.isCopyLike()) {
    if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
      return true;

    // Make sure this isn't copying exec as a normal operand
    return MI.readsRegister(AMDGPU::EXEC, &RI);
  }

  // Make a conservative assumption about the callee.
  if (MI.isCall())
    return true;

  // Be conservative with any unhandled generic opcodes.
  if (!isTargetSpecificOpcode(MI.getOpcode()))
    return true;

  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
}

// Returns true if an immediate of this bit width is encodable as an inline
// constant on the current subtarget.
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
  switch (Imm.getBitWidth()) {
  case 1: // This likely will be a condition code mask.
    return true;

  case 32:
    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 64:
    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  case 16:
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                        ST.hasInv2PiInlineImm());
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

// Returns true if the immediate operand \p MO is encodable as an inline
// constant for an operand of type \p OperandType (continued below).
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   uint8_t OperandType) const {
  if (!MO.isImm() ||
      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
      OperandType > AMDGPU::OPERAND_SRC_LAST)
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  int64_t Imm = MO.getImm();
  switch (OperandType) {
  // 32-bit operand types: check the truncated 32-bit value.
  case AMDGPU::OPERAND_REG_IMM_INT32:
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  // 64-bit operand types: check the full value.
  case AMDGPU::OPERAND_REG_IMM_INT64:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                        ST.hasInv2PiInlineImm());
  // 16-bit operand types: only values representable in 16 bits qualify, and
  // only when the subtarget has 16-bit instructions.
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  // Packed 16-bit vector operand types: check the truncated 32-bit pattern.
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
    uint32_t Trunc = static_cast<uint32_t>(Imm);
    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
  }
  default:
    llvm_unreachable("invalid bitwidth");
  }
}

// Returns true if \p MO would have to be encoded as a literal constant (or
// another non-register, non-inline value such as an address or frame index)
// rather than an inline constant.
bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
                                        const MCOperandInfo &OpInfo) const {
  switch (MO.getType()) {
  case MachineOperand::MO_Register:
    return false;
  case MachineOperand::MO_Immediate:
    return !isInlineConstant(MO, OpInfo);
  case MachineOperand::MO_FrameIndex:
  case MachineOperand::MO_MachineBasicBlock:
  case MachineOperand::MO_ExternalSymbol:
  case MachineOperand::MO_GlobalAddress:
  case MachineOperand::MO_MCSymbol:
    return true;
  default:
    llvm_unreachable("unexpected operand type");
  }
}

// Compare two register or immediate operands for equality of type and value.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  case MachineOperand::MO_Register:
    return Op0.getReg() == Op1.getReg();
  case MachineOperand::MO_Immediate:
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}

// Returns true if the immediate-like operand \p MO is legal in operand slot
// \p OpNo of \p MI (continued below).
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                                    const MachineOperand &MO) const {
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo
&OpInfo = InstDesc.OpInfo[OpNo]; 3215 3216 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3217 3218 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3219 return true; 3220 3221 if (OpInfo.RegClass < 0) 3222 return false; 3223 3224 const MachineFunction *MF = MI.getParent()->getParent(); 3225 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3226 3227 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3228 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3229 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3230 AMDGPU::OpName::src2)) 3231 return false; 3232 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3233 } 3234 3235 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3236 return false; 3237 3238 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3239 return true; 3240 3241 return ST.hasVOP3Literal(); 3242 } 3243 3244 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3245 int Op32 = AMDGPU::getVOPe32(Opcode); 3246 if (Op32 == -1) 3247 return false; 3248 3249 return pseudoToMCOpcode(Op32) != -1; 3250 } 3251 3252 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3253 // The src0_modifier operand is present on all instructions 3254 // that have modifiers. 
3255 3256 return AMDGPU::getNamedOperandIdx(Opcode, 3257 AMDGPU::OpName::src0_modifiers) != -1; 3258 } 3259 3260 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 3261 unsigned OpName) const { 3262 const MachineOperand *Mods = getNamedOperand(MI, OpName); 3263 return Mods && Mods->getImm(); 3264 } 3265 3266 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 3267 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 3268 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 3269 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 3270 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 3271 hasModifiersSet(MI, AMDGPU::OpName::omod); 3272 } 3273 3274 bool SIInstrInfo::canShrink(const MachineInstr &MI, 3275 const MachineRegisterInfo &MRI) const { 3276 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3277 // Can't shrink instruction with three operands. 3278 // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add 3279 // a special case for it. It can only be shrunk if the third operand 3280 // is vcc, and src0_modifiers and src1_modifiers are not set. 3281 // We should handle this the same way we handle vopc, by addding 3282 // a register allocation hint pre-regalloc and then do the shrinking 3283 // post-regalloc. 3284 if (Src2) { 3285 switch (MI.getOpcode()) { 3286 default: return false; 3287 3288 case AMDGPU::V_ADDC_U32_e64: 3289 case AMDGPU::V_SUBB_U32_e64: 3290 case AMDGPU::V_SUBBREV_U32_e64: { 3291 const MachineOperand *Src1 3292 = getNamedOperand(MI, AMDGPU::OpName::src1); 3293 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 3294 return false; 3295 // Additional verification is needed for sdst/src2. 
3296 return true; 3297 } 3298 case AMDGPU::V_MAC_F32_e64: 3299 case AMDGPU::V_MAC_F16_e64: 3300 case AMDGPU::V_FMAC_F32_e64: 3301 case AMDGPU::V_FMAC_F16_e64: 3302 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 3303 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 3304 return false; 3305 break; 3306 3307 case AMDGPU::V_CNDMASK_B32_e64: 3308 break; 3309 } 3310 } 3311 3312 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3313 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 3314 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 3315 return false; 3316 3317 // We don't need to check src0, all input types are legal, so just make sure 3318 // src0 isn't using any modifiers. 3319 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 3320 return false; 3321 3322 // Can it be shrunk to a valid 32 bit opcode? 3323 if (!hasVALU32BitEncoding(MI.getOpcode())) 3324 return false; 3325 3326 // Check output modifiers 3327 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 3328 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 3329 } 3330 3331 // Set VCC operand with all flags from \p Orig, except for setting it as 3332 // implicit. 3333 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3334 const MachineOperand &Orig) { 3335 3336 for (MachineOperand &Use : MI.implicit_operands()) { 3337 if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { 3338 Use.setIsUndef(Orig.isUndef()); 3339 Use.setIsKill(Orig.isKill()); 3340 return; 3341 } 3342 } 3343 } 3344 3345 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3346 unsigned Op32) const { 3347 MachineBasicBlock *MBB = MI.getParent();; 3348 MachineInstrBuilder Inst32 = 3349 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 3350 .setMIFlags(MI.getFlags()); 3351 3352 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3353 // For VOPC instructions, this is replaced by an implicit def of vcc. 
  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
  if (Op32DstIdx != -1) {
    // dst
    Inst32.add(MI.getOperand(0));
  } else {
    assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
            (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
           "Unexpected case");
  }

  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));

  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1)
    Inst32.add(*Src1);

  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);

  if (Src2) {
    int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
    if (Op32Src2Idx != -1) {
      Inst32.add(*Src2);
    } else {
      // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
      // replaced with an implicit read of vcc. This was already added
      // during the initial BuildMI, so find it to preserve the flags.
      copyFlagsToImplicitVCC(*Inst32, *Src2);
    }
  }

  return Inst32;
}

// Returns true if reading \p MO consumes a constant bus slot: non-inline
// immediates, other non-register operands (e.g. frame indices), and SGPR-
// class (or implicit M0/VCC) register reads do; SGPR_NULL is free.
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                  const MachineOperand &MO,
                                  const MCOperandInfo &OpInfo) const {
  // Literal constants use the constant bus.
  //if (isLiteralConstantLike(MO, OpInfo))
  //  return true;
  if (MO.isImm())
    return !isInlineConstant(MO, OpInfo);

  if (!MO.isReg())
    return true; // Misc other operands like FrameIndex

  if (!MO.isUse())
    return false;

  if (Register::isVirtualRegister(MO.getReg()))
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));

  // Null is free
  if (MO.getReg() == AMDGPU::SGPR_NULL)
    return false;

  // SGPRs use the constant bus
  if (MO.isImplicit()) {
    return MO.getReg() == AMDGPU::M0 ||
           MO.getReg() == AMDGPU::VCC ||
           MO.getReg() == AMDGPU::VCC_LO;
  } else {
    return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
           AMDGPU::SReg_64RegClass.contains(MO.getReg());
  }
}

// Returns the first implicitly read VCC/VCC_LO/VCC_HI/M0/FLAT_SCR operand of
// \p MI, or NoRegister if there is none.
static Register findImplicitSGPRRead(const MachineInstr &MI) {
  for (const MachineOperand &MO : MI.implicit_operands()) {
    // We only care about reads.
    if (MO.isDef())
      continue;

    switch (MO.getReg()) {
    case AMDGPU::VCC:
    case AMDGPU::VCC_LO:
    case AMDGPU::VCC_HI:
    case AMDGPU::M0:
    case AMDGPU::FLAT_SCR:
      return MO.getReg();

    default:
      break;
    }
  }

  return AMDGPU::NoRegister;
}

// Returns true if \p MI is expected to carry an implicit use of EXEC: VALU
// instructions except the readlane/writelane family; false for pre-ISel,
// generic, SALU and SMRD instructions.
static bool shouldReadExec(const MachineInstr &MI) {
  if (SIInstrInfo::isVALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READLANE_B32_gfx6_gfx7:
    case AMDGPU::V_READLANE_B32_gfx10:
    case AMDGPU::V_READLANE_B32_vi:
    case AMDGPU::V_WRITELANE_B32:
    case AMDGPU::V_WRITELANE_B32_gfx6_gfx7:
    case AMDGPU::V_WRITELANE_B32_gfx10:
    case AMDGPU::V_WRITELANE_B32_vi:
      return false;
    }

    return true;
  }

  if (MI.isPreISelOpcode() ||
      SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
      SIInstrInfo::isSALU(MI) ||
      SIInstrInfo::isSMRD(MI))
    return false;

  return true;
}

// Returns true if \p SubReg addresses a piece of \p SuperVec: a physical
// sub-register of it, or the same virtual register with a subreg index.
static bool isSubRegOf(const SIRegisterInfo &TRI,
                       const MachineOperand &SuperVec,
                       const MachineOperand &SubReg) {
  if (Register::isPhysicalRegister(SubReg.getReg()))
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());

  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
         SubReg.getReg() == SuperVec.getReg();
}

// Target-specific machine verifier hook; on failure sets \p ErrInfo and
// returns false.
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI.getOpcode();
  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
    return true;

  const MachineFunction *MF = MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);

  // Make sure the number of operands is correct.
  const MCInstrDesc &Desc = get(Opcode);
  if (!Desc.isVariadic() &&
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
    ErrInfo = "Instruction has wrong number of operands.";
    return false;
  }

  if (MI.isInlineAsm()) {
    // Verify register classes for inlineasm constraints.
3501 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 3502 I != E; ++I) { 3503 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 3504 if (!RC) 3505 continue; 3506 3507 const MachineOperand &Op = MI.getOperand(I); 3508 if (!Op.isReg()) 3509 continue; 3510 3511 Register Reg = Op.getReg(); 3512 if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) { 3513 ErrInfo = "inlineasm operand has incorrect register class."; 3514 return false; 3515 } 3516 } 3517 3518 return true; 3519 } 3520 3521 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 3522 ErrInfo = "missing memory operand from MIMG instruction."; 3523 return false; 3524 } 3525 3526 // Make sure the register classes are correct. 3527 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 3528 if (MI.getOperand(i).isFPImm()) { 3529 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 3530 "all fp values to integers."; 3531 return false; 3532 } 3533 3534 int RegClass = Desc.OpInfo[i].RegClass; 3535 3536 switch (Desc.OpInfo[i].OperandType) { 3537 case MCOI::OPERAND_REGISTER: 3538 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 3539 ErrInfo = "Illegal immediate value for operand."; 3540 return false; 3541 } 3542 break; 3543 case AMDGPU::OPERAND_REG_IMM_INT32: 3544 case AMDGPU::OPERAND_REG_IMM_FP32: 3545 break; 3546 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3547 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3548 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3549 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3550 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3551 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3552 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3553 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 3554 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3555 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3556 const MachineOperand &MO = MI.getOperand(i); 3557 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 3558 ErrInfo = "Illegal immediate 
value for operand."; 3559 return false; 3560 } 3561 break; 3562 } 3563 case MCOI::OPERAND_IMMEDIATE: 3564 case AMDGPU::OPERAND_KIMM32: 3565 // Check if this operand is an immediate. 3566 // FrameIndex operands will be replaced by immediates, so they are 3567 // allowed. 3568 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 3569 ErrInfo = "Expected immediate, but got non-immediate"; 3570 return false; 3571 } 3572 LLVM_FALLTHROUGH; 3573 default: 3574 continue; 3575 } 3576 3577 if (!MI.getOperand(i).isReg()) 3578 continue; 3579 3580 if (RegClass != -1) { 3581 Register Reg = MI.getOperand(i).getReg(); 3582 if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg)) 3583 continue; 3584 3585 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 3586 if (!RC->contains(Reg)) { 3587 ErrInfo = "Operand has incorrect register class."; 3588 return false; 3589 } 3590 } 3591 } 3592 3593 // Verify SDWA 3594 if (isSDWA(MI)) { 3595 if (!ST.hasSDWA()) { 3596 ErrInfo = "SDWA is not supported on this target"; 3597 return false; 3598 } 3599 3600 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 3601 3602 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 3603 3604 for (int OpIdx: OpIndicies) { 3605 if (OpIdx == -1) 3606 continue; 3607 const MachineOperand &MO = MI.getOperand(OpIdx); 3608 3609 if (!ST.hasSDWAScalar()) { 3610 // Only VGPRS on VI 3611 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 3612 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 3613 return false; 3614 } 3615 } else { 3616 // No immediates on GFX9 3617 if (!MO.isReg()) { 3618 ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; 3619 return false; 3620 } 3621 } 3622 } 3623 3624 if (!ST.hasSDWAOmod()) { 3625 // No omod allowed on VI 3626 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3627 if (OMod != nullptr && 3628 (!OMod->isImm() || OMod->getImm() != 0)) { 3629 ErrInfo 
= "OMod not allowed in SDWA instructions on VI"; 3630 return false; 3631 } 3632 } 3633 3634 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 3635 if (isVOPC(BasicOpcode)) { 3636 if (!ST.hasSDWASdst() && DstIdx != -1) { 3637 // Only vcc allowed as dst on VI for VOPC 3638 const MachineOperand &Dst = MI.getOperand(DstIdx); 3639 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 3640 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 3641 return false; 3642 } 3643 } else if (!ST.hasSDWAOutModsVOPC()) { 3644 // No clamp allowed on GFX9 for VOPC 3645 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3646 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 3647 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 3648 return false; 3649 } 3650 3651 // No omod allowed on GFX9 for VOPC 3652 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3653 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 3654 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 3655 return false; 3656 } 3657 } 3658 } 3659 3660 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 3661 if (DstUnused && DstUnused->isImm() && 3662 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 3663 const MachineOperand &Dst = MI.getOperand(DstIdx); 3664 if (!Dst.isReg() || !Dst.isTied()) { 3665 ErrInfo = "Dst register should have tied register"; 3666 return false; 3667 } 3668 3669 const MachineOperand &TiedMO = 3670 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 3671 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 3672 ErrInfo = 3673 "Dst register should be tied to implicit use of preserved register"; 3674 return false; 3675 } else if (Register::isPhysicalRegister(TiedMO.getReg()) && 3676 Dst.getReg() != TiedMO.getReg()) { 3677 ErrInfo = "Dst register should use same physical register as preserved"; 3678 return false; 3679 } 3680 } 3681 } 3682 3683 // Verify MIMG 3684 
if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 3685 // Ensure that the return type used is large enough for all the options 3686 // being used TFE/LWE require an extra result register. 3687 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 3688 if (DMask) { 3689 uint64_t DMaskImm = DMask->getImm(); 3690 uint32_t RegCount = 3691 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 3692 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 3693 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 3694 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 3695 3696 // Adjust for packed 16 bit values 3697 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 3698 RegCount >>= 1; 3699 3700 // Adjust if using LWE or TFE 3701 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 3702 RegCount += 1; 3703 3704 const uint32_t DstIdx = 3705 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 3706 const MachineOperand &Dst = MI.getOperand(DstIdx); 3707 if (Dst.isReg()) { 3708 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 3709 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 3710 if (RegCount > DstSize) { 3711 ErrInfo = "MIMG instruction returns too many registers for dst " 3712 "register class"; 3713 return false; 3714 } 3715 } 3716 } 3717 } 3718 3719 // Verify VOP*. Ignore multiple sgpr operands on writelane. 3720 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 3721 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 3722 // Only look at the true operands. Only a real operand can use the constant 3723 // bus, and we don't want to check pseudo-operands like the source modifier 3724 // flags. 
3725 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 3726 3727 unsigned ConstantBusCount = 0; 3728 unsigned LiteralCount = 0; 3729 3730 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 3731 ++ConstantBusCount; 3732 3733 SmallVector<Register, 2> SGPRsUsed; 3734 Register SGPRUsed = findImplicitSGPRRead(MI); 3735 if (SGPRUsed != AMDGPU::NoRegister) { 3736 ++ConstantBusCount; 3737 SGPRsUsed.push_back(SGPRUsed); 3738 } 3739 3740 for (int OpIdx : OpIndices) { 3741 if (OpIdx == -1) 3742 break; 3743 const MachineOperand &MO = MI.getOperand(OpIdx); 3744 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3745 if (MO.isReg()) { 3746 SGPRUsed = MO.getReg(); 3747 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 3748 return !RI.regsOverlap(SGPRUsed, SGPR); 3749 })) { 3750 ++ConstantBusCount; 3751 SGPRsUsed.push_back(SGPRUsed); 3752 } 3753 } else { 3754 ++ConstantBusCount; 3755 ++LiteralCount; 3756 } 3757 } 3758 } 3759 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 3760 // v_writelane_b32 is an exception from constant bus restriction: 3761 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 3762 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 3763 Opcode != AMDGPU::V_WRITELANE_B32) { 3764 ErrInfo = "VOP* instruction violates constant bus restriction"; 3765 return false; 3766 } 3767 3768 if (isVOP3(MI) && LiteralCount) { 3769 if (LiteralCount && !ST.hasVOP3Literal()) { 3770 ErrInfo = "VOP3 instruction uses literal"; 3771 return false; 3772 } 3773 if (LiteralCount > 1) { 3774 ErrInfo = "VOP3 instruction uses more than one literal"; 3775 return false; 3776 } 3777 } 3778 } 3779 3780 // Special case for writelane - this can break the multiple constant bus rule, 3781 // but still can't use more than one SGPR register 3782 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 3783 unsigned SGPRCount = 0; 3784 Register SGPRUsed = AMDGPU::NoRegister; 3785 3786 for (int OpIdx : {Src0Idx, Src1Idx, 
Src2Idx}) { 3787 if (OpIdx == -1) 3788 break; 3789 3790 const MachineOperand &MO = MI.getOperand(OpIdx); 3791 3792 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3793 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 3794 if (MO.getReg() != SGPRUsed) 3795 ++SGPRCount; 3796 SGPRUsed = MO.getReg(); 3797 } 3798 } 3799 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 3800 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 3801 return false; 3802 } 3803 } 3804 } 3805 3806 // Verify misc. restrictions on specific instructions. 3807 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 3808 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 3809 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3810 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3811 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 3812 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 3813 if (!compareMachineOp(Src0, Src1) && 3814 !compareMachineOp(Src0, Src2)) { 3815 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 3816 return false; 3817 } 3818 } 3819 } 3820 3821 if (isSOP2(MI) || isSOPC(MI)) { 3822 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3823 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3824 unsigned Immediates = 0; 3825 3826 if (!Src0.isReg() && 3827 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) 3828 Immediates++; 3829 if (!Src1.isReg() && 3830 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) 3831 Immediates++; 3832 3833 if (Immediates > 1) { 3834 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 3835 return false; 3836 } 3837 } 3838 3839 if (isSOPK(MI)) { 3840 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 3841 if (Desc.isBranch()) { 3842 if (!Op->isMBB()) { 3843 ErrInfo = "invalid branch target for SOPK instruction"; 3844 return false; 3845 } 3846 } else { 3847 uint64_t Imm = Op->getImm(); 3848 if (sopkIsZext(MI)) { 3849 if (!isUInt<16>(Imm)) { 3850 ErrInfo = "invalid 
immediate for SOPK instruction"; 3851 return false; 3852 } 3853 } else { 3854 if (!isInt<16>(Imm)) { 3855 ErrInfo = "invalid immediate for SOPK instruction"; 3856 return false; 3857 } 3858 } 3859 } 3860 } 3861 3862 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 3863 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 3864 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3865 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 3866 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3867 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 3868 3869 const unsigned StaticNumOps = Desc.getNumOperands() + 3870 Desc.getNumImplicitUses(); 3871 const unsigned NumImplicitOps = IsDst ? 2 : 1; 3872 3873 // Allow additional implicit operands. This allows a fixup done by the post 3874 // RA scheduler where the main implicit operand is killed and implicit-defs 3875 // are added for sub-registers that remain live after this instruction. 3876 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 3877 ErrInfo = "missing implicit register operands"; 3878 return false; 3879 } 3880 3881 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3882 if (IsDst) { 3883 if (!Dst->isUse()) { 3884 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 3885 return false; 3886 } 3887 3888 unsigned UseOpIdx; 3889 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 3890 UseOpIdx != StaticNumOps + 1) { 3891 ErrInfo = "movrel implicit operands should be tied"; 3892 return false; 3893 } 3894 } 3895 3896 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3897 const MachineOperand &ImpUse 3898 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 3899 if (!ImpUse.isReg() || !ImpUse.isUse() || 3900 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 3901 ErrInfo = "src0 should be subreg of implicit vector use"; 3902 return false; 3903 } 3904 } 3905 3906 // Make sure we aren't losing exec uses in the td files. 
This mostly requires 3907 // being careful when using let Uses to try to add other use registers. 3908 if (shouldReadExec(MI)) { 3909 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 3910 ErrInfo = "VALU instruction does not implicitly read exec mask"; 3911 return false; 3912 } 3913 } 3914 3915 if (isSMRD(MI)) { 3916 if (MI.mayStore()) { 3917 // The register offset form of scalar stores may only use m0 as the 3918 // soffset register. 3919 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 3920 if (Soff && Soff->getReg() != AMDGPU::M0) { 3921 ErrInfo = "scalar stores must use m0 as offset register"; 3922 return false; 3923 } 3924 } 3925 } 3926 3927 if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { 3928 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3929 if (Offset->getImm() != 0) { 3930 ErrInfo = "subtarget does not support offsets in flat instructions"; 3931 return false; 3932 } 3933 } 3934 3935 if (isMIMG(MI)) { 3936 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 3937 if (DimOp) { 3938 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 3939 AMDGPU::OpName::vaddr0); 3940 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 3941 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 3942 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3943 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 3944 const AMDGPU::MIMGDimInfo *Dim = 3945 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 3946 3947 if (!Dim) { 3948 ErrInfo = "dim is out of range"; 3949 return false; 3950 } 3951 3952 bool IsA16 = false; 3953 if (ST.hasR128A16()) { 3954 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 3955 IsA16 = R128A16->getImm() != 0; 3956 } else if (ST.hasGFX10A16()) { 3957 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 3958 IsA16 = A16->getImm() != 0; 3959 } 3960 3961 bool PackDerivatives = IsA16; // Either 
A16 or G16 3962 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 3963 3964 unsigned AddrWords = BaseOpcode->NumExtraArgs; 3965 unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + 3966 (BaseOpcode->LodOrClampOrMip ? 1 : 0); 3967 if (IsA16) 3968 AddrWords += (AddrComponents + 1) / 2; 3969 else 3970 AddrWords += AddrComponents; 3971 3972 if (BaseOpcode->Gradients) { 3973 if (PackDerivatives) 3974 // There are two gradients per coordinate, we pack them separately. 3975 // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) 3976 AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; 3977 else 3978 AddrWords += Dim->NumGradients; 3979 } 3980 3981 unsigned VAddrWords; 3982 if (IsNSA) { 3983 VAddrWords = SRsrcIdx - VAddr0Idx; 3984 } else { 3985 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); 3986 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; 3987 if (AddrWords > 8) 3988 AddrWords = 16; 3989 else if (AddrWords > 4) 3990 AddrWords = 8; 3991 else if (AddrWords == 4) 3992 AddrWords = 4; 3993 else if (AddrWords == 3) 3994 AddrWords = 3; 3995 } 3996 3997 if (VAddrWords != AddrWords) { 3998 ErrInfo = "bad vaddr size"; 3999 return false; 4000 } 4001 } 4002 } 4003 4004 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 4005 if (DppCt) { 4006 using namespace AMDGPU::DPP; 4007 4008 unsigned DC = DppCt->getImm(); 4009 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 4010 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 4011 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 4012 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 4013 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 4014 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 4015 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 4016 ErrInfo = "Invalid dpp_ctrl value"; 4017 return false; 4018 } 
    // On GFX10+ the wavefront-wide DPP shift/rotate controls were removed.
    if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "wavefront shifts are not supported on GFX10+";
      return false;
    }
    // Row broadcasts (BCAST15/BCAST31) were likewise dropped on GFX10+.
    if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
        ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "broadcasts are not supported on GFX10+";
      return false;
    }
    // row_share/row_xmask are GFX10-only controls; reject them on older chips.
    if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
        ST.getGeneration() < AMDGPUSubtarget::GFX10) {
      ErrInfo = "Invalid dpp_ctrl value: "
                "row_share and row_xmask are not supported before GFX10";
      return false;
    }
  }

  // All checks passed; the instruction is well formed.
  return true;
}

// Map a scalar (SALU) opcode to the VALU opcode to use when the instruction
// must be moved to the VALU. Returns INSTRUCTION_LIST_END when no vector
// equivalent is available.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
  case AMDGPU::WWM: return AMDGPU::WWM;
  case AMDGPU::S_MOV_B32: {
    // A register move (or a move into an AGPR destination) must stay a COPY;
    // only an immediate move can become V_MOV_B32.
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
  // FIXME: These are not consistently handled, and selected when the carry is
  // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_I32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    // V_XNOR only exists on subtargets with DL instructions.
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): S_NOT_B64 maps to the 32-bit V_NOT; presumably the caller
  // splits the 64-bit value into two halves — confirm against moveToVALU.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}

// Return the register class constraining operand \p OpNo. For variadic
// instructions or operands without a class in the descriptor, fall back to
// the class of the actual register operand.
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                      unsigned OpNo) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  const MCInstrDesc &Desc = get(MI.getOpcode());
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.OpInfo[OpNo].RegClass == -1) {
    Register Reg = MI.getOperand(OpNo).getReg();

    if (Register::isVirtualRegister(Reg))
      return MRI.getRegClass(Reg);
    return RI.getPhysRegClass(Reg);
  }

  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return RI.getRegClass(RCID);
}

// Replace operand \p OpIdx of \p MI with the result of a freshly inserted
// move/copy so that the operand becomes legal for the instruction.
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
  MachineBasicBlock::iterator I = MI;
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  // Pick the move opcode: register operands are plain COPYs, immediates
  // destined for an SGPR class use a scalar move, everything else a V_MOV.
  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;

  // The temporary always lives in a VGPR class of matching (32/64-bit) width.
  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
    VRC = &AMDGPU::VReg_64RegClass;
  else
    VRC = &AMDGPU::VGPR_32RegClass;

  Register Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  // Rewrite the original operand in place to use the new register.
  MO.ChangeToRegister(Reg, false);
}

// Materialize sub-register \p SubIdx of \p SuperReg into a fresh register of
// class \p SubRC (inserted before \p MI) and return the new register.
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
                                         MachineRegisterInfo &MRI,
                                         MachineOperand &SuperReg,
                                         const TargetRegisterClass *SuperRC,
                                         unsigned SubIdx,
                                         const TargetRegisterClass *SubRC)
                                         const {
  MachineBasicBlock *MBB = MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  Register SubReg = MRI.createVirtualRegister(SubRC);

  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), 0, SubIdx);
    return SubReg;
  }

  // Just in case the super register is itself a sub-register, copy it to a new
  // value so we don't need to worry about merging its subreg index with the
  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  Register NewSuperReg = MRI.createVirtualRegister(SuperRC);

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());

  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
    .addReg(NewSuperReg, 0, SubIdx);

  return SubReg;
}

// Like buildExtractSubReg, but \p Op may also be a 64-bit immediate, in which
// case the requested 32-bit half is returned as an immediate operand.
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
  MachineBasicBlock::iterator MII,
  MachineRegisterInfo &MRI,
  MachineOperand &Op,
  const TargetRegisterClass *SuperRC,
  unsigned SubIdx,
  const TargetRegisterClass *SubRC) const {
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}

// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
  assert(Inst.getNumExplicitOperands() == 3);
  MachineOperand Op1 = Inst.getOperand(1);
  Inst.RemoveOperand(1);
  Inst.addOperand(Op1);
}

// Check whether register operand \p MO satisfies the register-class
// constraint \p OpInfo of its instruction (including sub-register accesses).
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
                                    const MCOperandInfo &OpInfo,
                                    const MachineOperand &MO) const {
  if (!MO.isReg())
    return false;

  Register Reg = MO.getReg();
  const TargetRegisterClass *RC = Register::isVirtualRegister(Reg) ?
                                      MRI.getRegClass(Reg)
                                    : RI.getPhysRegClass(Reg);

  // The descriptor's class, possibly adjusted for a sub-register access.
  const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
  if (MO.getSubReg()) {
    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
    if (!SuperRC)
      return false;

    DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
    if (!DRC)
      return false;
  }
  return RC->hasSuperClassEq(DRC);
}

// A VSrc operand is legal if it is a legal register or an immediate-like
// operand (immediate, target index, frame index, or global address).
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
                                     const MCOperandInfo &OpInfo,
                                     const MachineOperand &MO) const {
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
  return true;
}

// Check whether \p MO (or the existing operand at \p OpIdx when \p MO is
// null) would be legal at position \p OpIdx of \p MI, taking the subtarget's
// constant-bus and VOP3-literal limits into account.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    // A literal in a VOP3 encoding is only allowed at all on subtargets with
    // VOP3 literals.
    if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
      return false;

    // Count the constant-bus users among the remaining operands; distinct
    // SGPR (reg, subreg) pairs each consume one slot.
    SmallDenseSet<RegSubRegPair> SGPRsUsed;
    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
        if (!SGPRsUsed.count(SGPR) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          if (--ConstantBusLimit <= 0)
            return false;
          SGPRsUsed.insert(SGPR);
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        // KIMM32 operands also occupy the constant bus.
        if (--ConstantBusLimit <= 0)
          return false;
      } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
                 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
        // Another literal consumes both a literal slot and a bus slot.
        if (!VOP3LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    return isLegalRegOperand(MRI, OpInfo, *MO);
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  // The operand has a register class but holds an immediate-like value;
  // defer to the immediate legality check.
  return isImmOperandLegal(MI, OpIdx, *MO);
}

// Legalize the source operands of a VOP2 instruction, inserting moves or
// commuting the instruction so register-class and constant-bus restrictions
// are respected.
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &InstrDesc = get(Opc);

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
  // we need to only have one constant bus use before GFX10.
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
  if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
      Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
       isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
    legalizeOpWithMove(MI, Src0Idx);

  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
  // src0/src1 with V_READFIRSTLANE.
  if (Opc == AMDGPU::V_WRITELANE_B32) {
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src0);
      Src0.ChangeToRegister(Reg, false);
    }
    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      const DebugLoc &DL = MI.getDebugLoc();
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    return;
  }

  // No VOP2 instructions support AGPRs.
  if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
    legalizeOpWithMove(MI, Src0Idx);

  if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
    legalizeOpWithMove(MI, Src1Idx);

  // VOP2 src0 instructions support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
    return;

  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
  // select is uniform.
  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
      RI.isVGPR(MRI, Src1.getReg())) {
    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
    Src1.ChangeToRegister(Reg, false);
    return;
  }

  // We do not use commuteInstruction here because it is too aggressive and will
  // commute if it is possible. We only want to commute here if it improves
  // legality. This can be called a fairly large number of times so don't waste
  // compile time pointlessly swapping and checking legality again.
  if (HasImplicitSGPR || !MI.isCommutable()) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  // If src0 can be used as src1, commuting will make the operands legal.
  // Otherwise we have to give up and insert a move.
  //
  // TODO: Other immediate-like operand kinds could be commuted if there was a
  // MachineOperand::ChangeTo* for them.
  if ((!Src1.isImm() && !Src1.isReg()) ||
      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  int CommutedOpc = commuteOpcode(MI);
  if (CommutedOpc == -1) {
    legalizeOpWithMove(MI, Src1Idx);
    return;
  }

  MI.setDesc(get(CommutedOpc));

  // Manually swap src0 and src1, preserving sub-registers and kill flags.
  Register Src0Reg = Src0.getReg();
  unsigned Src0SubReg = Src0.getSubReg();
  bool Src0Kill = Src0.isKill();

  if (Src1.isImm())
    Src0.ChangeToImmediate(Src1.getImm());
  else if (Src1.isReg()) {
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
    Src0.setSubReg(Src1.getSubReg());
  } else
    llvm_unreachable("Should only have register or immediate operands");

  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
  Src1.setSubReg(Src0SubReg);
  fixImplicitOperands(MI);
}

// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32 ||
      Opc == AMDGPU::V_PERMLANEX16_B32) {
    // src1 and src2 must be scalar
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src2);
      Src2.ChangeToRegister(Reg, false);
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg != AMDGPU::NoRegister) {
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
      // Non-register operand: only literal-like values need legalization.
      if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
        continue;

      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        // This literal fits within the remaining limits; keep it.
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      // Out of literal/bus budget: move the literal into a register.
      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
        !isOperandLegal(MI, Idx, &MO)) {
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue;
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}

// Copy a (possibly multi-word) VGPR/AGPR value into a fresh SGPR of matching
// width using per-32-bit V_READFIRSTLANE reads, inserted before \p UseMI.
// Returns the new SGPR.
Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  Register DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  if (RI.hasAGPRs(VRC)) {
    // Copy AGPR sources through an equivalent VGPR first.
    VRC = RI.getEquivalentVGPRClass(VRC);
    Register NewSrcReg = MRI.createVirtualRegister(VRC);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(TargetOpcode::COPY), NewSrcReg)
        .addReg(SrcReg);
    SrcReg = NewSrcReg;
  }

  if (SubRegs == 1) {
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
        .addReg(SrcReg);
    return DstReg;
  }

  // Read each 32-bit piece separately, then reassemble with a REG_SEQUENCE.
  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  MachineInstrBuilder MIB =
      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
              get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

// Force the sbase/soff operands of an SMRD instruction into SGPRs.
void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {

  // If the pointer is stored in VGPRs, then we need to move them to
  // SGPRs using v_readfirstlane. This is safe because we only select
  // loads with uniform pointers to SMRD instruction so we know the
  // pointer value is uniform.
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
    SBase->setReg(SGPR);
  }
  // The soffset operand, when present, must likewise be scalar.
  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
    SOff->setReg(SGPR);
  }
}

// Ensure operand \p Op has register class \p DstRC by inserting a COPY before
// \p I when needed; the copy may later be folded if it copies an immediate.
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  Register OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  Register DstReg = MRI.createVirtualRegister(DstRC);
  MachineInstr *Copy =
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);

  Op.setReg(DstReg);
  Op.setSubReg(0);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
    FoldImmediate(*Copy, *Def, OpReg, &MRI);

  // Walk back through COPY chains (stopping at physical-register sources) to
  // see whether the value ultimately comes from an IMPLICIT_DEF.
  bool ImpDef = Def->isImplicitDef();
  while (!ImpDef && Def && Def->isCopy()) {
    if (Def->getOperand(1).getReg().isPhysical())
      break;
    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
    ImpDef = Def && Def->isImplicitDef();
  }
  // Copies into non-SGPR classes depend on the exec mask; record that as an
  // implicit use unless the source is undefined anyway.
  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
      !ImpDef)
    Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
}

// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p Rsrc across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                          MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                          const DebugLoc &DL, MachineOperand &Rsrc) {
  MachineFunction &MF = *OrigBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Select the wave32/wave64 variants of the exec-mask opcodes up front.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned SaveExecOpc =
      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  unsigned XorTermOpc =
      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  unsigned AndOpc =
      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  MachineBasicBlock::iterator I = LoopBB.begin();

  Register VRsrc = Rsrc.getReg();
  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  // Beginning of the loop, read the next Rsrc variant.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);

  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
      .addReg(SRsrcSub0)
      .addImm(AMDGPU::sub0)
      .addReg(SRsrcSub1)
      .addImm(AMDGPU::sub1)
      .addReg(SRsrcSub2)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcSub3)
      .addImm(AMDGPU::sub3);

  // Update Rsrc operand to use the SGPR Rsrc.
  Rsrc.setReg(SRsrc);
  Rsrc.setIsKill(true);

  // Identify all lanes with identical Rsrc operands in their VGPRs.
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
      .addReg(CondReg0)
      .addReg(CondReg1);

  MRI.setSimpleHint(SaveExec, AndCond);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
      .addReg(AndCond, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = LoopBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
      .addReg(Exec)
      .addReg(SaveExec);
  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}

// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
// with SGPRs by iterating over all unique values across all lanes.
static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                              MachineOperand &Rsrc, MachineDominatorTree *MDT) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator I(&MI);
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ?
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4728 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 4729 4730 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 4731 4732 // Save the EXEC mask 4733 BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 4734 4735 // Killed uses in the instruction we are waterfalling around will be 4736 // incorrect due to the added control-flow. 4737 for (auto &MO : MI.uses()) { 4738 if (MO.isReg() && MO.isUse()) { 4739 MRI.clearKillFlags(MO.getReg()); 4740 } 4741 } 4742 4743 // To insert the loop we need to split the block. Move everything after this 4744 // point to a new block, and insert a new empty block between the two. 4745 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 4746 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 4747 MachineFunction::iterator MBBI(MBB); 4748 ++MBBI; 4749 4750 MF.insert(MBBI, LoopBB); 4751 MF.insert(MBBI, RemainderBB); 4752 4753 LoopBB->addSuccessor(LoopBB); 4754 LoopBB->addSuccessor(RemainderBB); 4755 4756 // Move MI to the LoopBB, and the remainder of the block to RemainderBB. 4757 MachineBasicBlock::iterator J = I++; 4758 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 4759 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 4760 LoopBB->splice(LoopBB->begin(), &MBB, J); 4761 4762 MBB.addSuccessor(LoopBB); 4763 4764 // Update dominators. We know that MBB immediately dominates LoopBB, that 4765 // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately 4766 // dominates all of the successors transferred to it from MBB that MBB used 4767 // to properly dominate. 
4768 if (MDT) { 4769 MDT->addNewBlock(LoopBB, &MBB); 4770 MDT->addNewBlock(RemainderBB, LoopBB); 4771 for (auto &Succ : RemainderBB->successors()) { 4772 if (MDT->properlyDominates(&MBB, Succ)) { 4773 MDT->changeImmediateDominator(Succ, RemainderBB); 4774 } 4775 } 4776 } 4777 4778 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); 4779 4780 // Restore the EXEC mask 4781 MachineBasicBlock::iterator First = RemainderBB->begin(); 4782 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 4783 } 4784 4785 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 4786 static std::tuple<unsigned, unsigned> 4787 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 4788 MachineBasicBlock &MBB = *MI.getParent(); 4789 MachineFunction &MF = *MBB.getParent(); 4790 MachineRegisterInfo &MRI = MF.getRegInfo(); 4791 4792 // Extract the ptr from the resource descriptor. 4793 unsigned RsrcPtr = 4794 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 4795 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 4796 4797 // Create an empty resource descriptor 4798 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 4799 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4800 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4801 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 4802 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 4803 4804 // Zero64 = 0 4805 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 4806 .addImm(0); 4807 4808 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 4809 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 4810 .addImm(RsrcDataFormat & 0xFFFFFFFF); 4811 4812 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 4813 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 4814 .addImm(RsrcDataFormat >> 32); 4815 4816 // 
NewSRsrc = {Zero64, SRsrcFormat}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

  return std::make_tuple(RsrcPtr, NewSRsrc);
}

/// Legalize the register operands of \p MI, dispatching to the per-format
/// helpers (VOP2/VOPC, VOP3, SMRD) and handling PHI, REG_SEQUENCE,
/// INSERT_SUBREG, SI_INIT_M0 and the MIMG/MUBUF/MTBUF resource operands below.
void SIInstrInfo::legalizeOperands(MachineInstr &MI,
                                   MachineDominatorTree *MDT) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Legalize VOP2
  if (isVOP2(MI) || isVOPC(MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI.getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // PHI operands come in (reg, MBB) pairs, hence the stride-2 walk over the
    // value operands starting at index 1.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
      if (!MI.getOperand(i).isReg() ||
          !Register::isVirtualRegister(MI.getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
          MRI.getRegClass(MI.getOperand(i).getReg());
      if (RI.hasVectorRegisters(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be;
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
      if (!VRC) {
        assert(SRC);
        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
          VRC = &AMDGPU::VReg_1RegClass;
        } else
          VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                    ?
RI.getEquivalentAGPRClass(SRC) 4880 : RI.getEquivalentVGPRClass(SRC); 4881 } else { 4882 VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) 4883 ? RI.getEquivalentAGPRClass(VRC) 4884 : RI.getEquivalentVGPRClass(VRC); 4885 } 4886 RC = VRC; 4887 } else { 4888 RC = SRC; 4889 } 4890 4891 // Update all the operands so they have the same type. 4892 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 4893 MachineOperand &Op = MI.getOperand(I); 4894 if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) 4895 continue; 4896 4897 // MI is a PHI instruction. 4898 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 4899 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 4900 4901 // Avoid creating no-op copies with the same src and dst reg class. These 4902 // confuse some of the machine passes. 4903 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 4904 } 4905 } 4906 4907 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 4908 // VGPR dest type and SGPR sources, insert copies so all operands are 4909 // VGPRs. This seems to help operand folding / the register coalescer. 4910 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 4911 MachineBasicBlock *MBB = MI.getParent(); 4912 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 4913 if (RI.hasVGPRs(DstRC)) { 4914 // Update all the operands so they are VGPR register classes. These may 4915 // not be the same register class because REG_SEQUENCE supports mixing 4916 // subregister index types e.g. 
sub0_sub1 + sub2 + sub3 4917 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 4918 MachineOperand &Op = MI.getOperand(I); 4919 if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) 4920 continue; 4921 4922 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 4923 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 4924 if (VRC == OpRC) 4925 continue; 4926 4927 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 4928 Op.setIsKill(); 4929 } 4930 } 4931 4932 return; 4933 } 4934 4935 // Legalize INSERT_SUBREG 4936 // src0 must have the same register class as dst 4937 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 4938 Register Dst = MI.getOperand(0).getReg(); 4939 Register Src0 = MI.getOperand(1).getReg(); 4940 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 4941 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 4942 if (DstRC != Src0RC) { 4943 MachineBasicBlock *MBB = MI.getParent(); 4944 MachineOperand &Op = MI.getOperand(1); 4945 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 4946 } 4947 return; 4948 } 4949 4950 // Legalize SI_INIT_M0 4951 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { 4952 MachineOperand &Src = MI.getOperand(0); 4953 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 4954 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 4955 return; 4956 } 4957 4958 // Legalize MIMG and MUBUF/MTBUF for shaders. 4959 // 4960 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 4961 // scratch memory access. In both cases, the legalization never involves 4962 // conversion to the addr64 form. 
if (isMIMG(MI) ||
      (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
       (isMUBUF(MI) || isMTBUF(MI)))) {
    // If srsrc ended up in a vector register class, read it back to an SGPR
    // with V_READFIRSTLANE copies (readlaneVGPRToSGPR) and rewrite the
    // operand.
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    // Same treatment for the sampler operand, when present.
    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions.
  int RsrcIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
  if (RsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
    unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
                             RI.getRegClass(RsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    // Legalize a VGPR Rsrc.
    //
    // If the instruction is _ADDR64, we can avoid a waterfall by extracting
    // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
    // a zero-value SRsrc.
    //
    // If the instruction is _OFFSET (both idxen and offen disabled), and we
    // support ADDR64 instructions, we can convert to ADDR64 and do the same as
    // above.
    //
    // Otherwise we are on non-ADDR64 hardware, and/or we have
    // idxen/offen/bothen and we fall back to a waterfall loop.
5006 5007 MachineBasicBlock &MBB = *MI.getParent(); 5008 5009 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 5010 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { 5011 // This is already an ADDR64 instruction so we need to add the pointer 5012 // extracted from the resource descriptor to the current value of VAddr. 5013 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5014 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5015 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 5016 5017 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5018 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); 5019 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); 5020 5021 unsigned RsrcPtr, NewSRsrc; 5022 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 5023 5024 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 5025 const DebugLoc &DL = MI.getDebugLoc(); 5026 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo) 5027 .addDef(CondReg0) 5028 .addReg(RsrcPtr, 0, AMDGPU::sub0) 5029 .addReg(VAddr->getReg(), 0, AMDGPU::sub0) 5030 .addImm(0); 5031 5032 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 5033 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) 5034 .addDef(CondReg1, RegState::Dead) 5035 .addReg(RsrcPtr, 0, AMDGPU::sub1) 5036 .addReg(VAddr->getReg(), 0, AMDGPU::sub1) 5037 .addReg(CondReg0, RegState::Kill) 5038 .addImm(0); 5039 5040 // NewVaddr = {NewVaddrHi, NewVaddrLo} 5041 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 5042 .addReg(NewVAddrLo) 5043 .addImm(AMDGPU::sub0) 5044 .addReg(NewVAddrHi) 5045 .addImm(AMDGPU::sub1); 5046 5047 VAddr->setReg(NewVAddr); 5048 Rsrc->setReg(NewSRsrc); 5049 } else if (!VAddr && ST.hasAddr64()) { 5050 // This instructions is the _OFFSET variant, so we need to convert it to 5051 // ADDR64. 
assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
           "FIXME: Need to emit flat atomics here");

      unsigned RsrcPtr, NewSRsrc;
      std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);

      Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store: vdata, vaddr, srsrc, soffset, offset,
        // then the modifier bits that exist for this opcode.
        MachineInstrBuilder MIB =
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
                .add(*VData)
                .addReg(NewVAddr)
                .addReg(NewSRsrc)
                .add(*SOffset)
                .add(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC =
                getNamedOperand(MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }
        if (const MachineOperand *DLC =
                getNamedOperand(MI, AMDGPU::OpName::dlc)) {
          MIB.addImm(DLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));

        if (const MachineOperand *TFE =
                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));

        MIB.cloneMemRefs(MI);
        Addr64 = MIB;
      } else {
        // Atomics with return.
5103 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 5104 .add(*VData) 5105 .add(*VDataIn) 5106 .addReg(NewVAddr) 5107 .addReg(NewSRsrc) 5108 .add(*SOffset) 5109 .add(*Offset) 5110 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 5111 .cloneMemRefs(MI); 5112 } 5113 5114 MI.removeFromParent(); 5115 5116 // NewVaddr = {NewVaddrHi, NewVaddrLo} 5117 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 5118 NewVAddr) 5119 .addReg(RsrcPtr, 0, AMDGPU::sub0) 5120 .addImm(AMDGPU::sub0) 5121 .addReg(RsrcPtr, 0, AMDGPU::sub1) 5122 .addImm(AMDGPU::sub1); 5123 } else { 5124 // This is another variant; legalize Rsrc with waterfall loop from VGPRs 5125 // to SGPRs. 5126 loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); 5127 } 5128 } 5129 } 5130 5131 void SIInstrInfo::moveToVALU(MachineInstr &TopInst, 5132 MachineDominatorTree *MDT) const { 5133 SetVectorType Worklist; 5134 Worklist.insert(&TopInst); 5135 5136 while (!Worklist.empty()) { 5137 MachineInstr &Inst = *Worklist.pop_back_val(); 5138 MachineBasicBlock *MBB = Inst.getParent(); 5139 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 5140 5141 unsigned Opcode = Inst.getOpcode(); 5142 unsigned NewOpcode = getVALUOp(Inst); 5143 5144 // Handle some special cases 5145 switch (Opcode) { 5146 default: 5147 break; 5148 case AMDGPU::S_ADD_U64_PSEUDO: 5149 case AMDGPU::S_SUB_U64_PSEUDO: 5150 splitScalar64BitAddSub(Worklist, Inst, MDT); 5151 Inst.eraseFromParent(); 5152 continue; 5153 case AMDGPU::S_ADD_I32: 5154 case AMDGPU::S_SUB_I32: 5155 // FIXME: The u32 versions currently selected use the carry. 
5156 if (moveScalarAddSub(Worklist, Inst, MDT)) 5157 continue; 5158 5159 // Default handling 5160 break; 5161 case AMDGPU::S_AND_B64: 5162 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 5163 Inst.eraseFromParent(); 5164 continue; 5165 5166 case AMDGPU::S_OR_B64: 5167 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 5168 Inst.eraseFromParent(); 5169 continue; 5170 5171 case AMDGPU::S_XOR_B64: 5172 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 5173 Inst.eraseFromParent(); 5174 continue; 5175 5176 case AMDGPU::S_NAND_B64: 5177 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 5178 Inst.eraseFromParent(); 5179 continue; 5180 5181 case AMDGPU::S_NOR_B64: 5182 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 5183 Inst.eraseFromParent(); 5184 continue; 5185 5186 case AMDGPU::S_XNOR_B64: 5187 if (ST.hasDLInsts()) 5188 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 5189 else 5190 splitScalar64BitXnor(Worklist, Inst, MDT); 5191 Inst.eraseFromParent(); 5192 continue; 5193 5194 case AMDGPU::S_ANDN2_B64: 5195 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 5196 Inst.eraseFromParent(); 5197 continue; 5198 5199 case AMDGPU::S_ORN2_B64: 5200 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 5201 Inst.eraseFromParent(); 5202 continue; 5203 5204 case AMDGPU::S_NOT_B64: 5205 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 5206 Inst.eraseFromParent(); 5207 continue; 5208 5209 case AMDGPU::S_BCNT1_I32_B64: 5210 splitScalar64BitBCNT(Worklist, Inst); 5211 Inst.eraseFromParent(); 5212 continue; 5213 5214 case AMDGPU::S_BFE_I64: 5215 splitScalar64BitBFE(Worklist, Inst); 5216 Inst.eraseFromParent(); 5217 continue; 5218 5219 case AMDGPU::S_LSHL_B32: 5220 if (ST.hasOnlyRevVALUShifts()) { 5221 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 5222 swapOperands(Inst); 5223 } 5224 break; 5225 case AMDGPU::S_ASHR_I32: 5226 if 
(ST.hasOnlyRevVALUShifts()) { 5227 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 5228 swapOperands(Inst); 5229 } 5230 break; 5231 case AMDGPU::S_LSHR_B32: 5232 if (ST.hasOnlyRevVALUShifts()) { 5233 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 5234 swapOperands(Inst); 5235 } 5236 break; 5237 case AMDGPU::S_LSHL_B64: 5238 if (ST.hasOnlyRevVALUShifts()) { 5239 NewOpcode = AMDGPU::V_LSHLREV_B64; 5240 swapOperands(Inst); 5241 } 5242 break; 5243 case AMDGPU::S_ASHR_I64: 5244 if (ST.hasOnlyRevVALUShifts()) { 5245 NewOpcode = AMDGPU::V_ASHRREV_I64; 5246 swapOperands(Inst); 5247 } 5248 break; 5249 case AMDGPU::S_LSHR_B64: 5250 if (ST.hasOnlyRevVALUShifts()) { 5251 NewOpcode = AMDGPU::V_LSHRREV_B64; 5252 swapOperands(Inst); 5253 } 5254 break; 5255 5256 case AMDGPU::S_ABS_I32: 5257 lowerScalarAbs(Worklist, Inst); 5258 Inst.eraseFromParent(); 5259 continue; 5260 5261 case AMDGPU::S_CBRANCH_SCC0: 5262 case AMDGPU::S_CBRANCH_SCC1: 5263 // Clear unused bits of vcc 5264 if (ST.isWave32()) 5265 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), 5266 AMDGPU::VCC_LO) 5267 .addReg(AMDGPU::EXEC_LO) 5268 .addReg(AMDGPU::VCC_LO); 5269 else 5270 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 5271 AMDGPU::VCC) 5272 .addReg(AMDGPU::EXEC) 5273 .addReg(AMDGPU::VCC); 5274 break; 5275 5276 case AMDGPU::S_BFE_U64: 5277 case AMDGPU::S_BFM_B64: 5278 llvm_unreachable("Moving this op to VALU not implemented"); 5279 5280 case AMDGPU::S_PACK_LL_B32_B16: 5281 case AMDGPU::S_PACK_LH_B32_B16: 5282 case AMDGPU::S_PACK_HH_B32_B16: 5283 movePackToVALU(Worklist, MRI, Inst); 5284 Inst.eraseFromParent(); 5285 continue; 5286 5287 case AMDGPU::S_XNOR_B32: 5288 lowerScalarXnor(Worklist, Inst); 5289 Inst.eraseFromParent(); 5290 continue; 5291 5292 case AMDGPU::S_NAND_B32: 5293 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 5294 Inst.eraseFromParent(); 5295 continue; 5296 5297 case AMDGPU::S_NOR_B32: 5298 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 5299 Inst.eraseFromParent(); 
5300 continue; 5301 5302 case AMDGPU::S_ANDN2_B32: 5303 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 5304 Inst.eraseFromParent(); 5305 continue; 5306 5307 case AMDGPU::S_ORN2_B32: 5308 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 5309 Inst.eraseFromParent(); 5310 continue; 5311 5312 // TODO: remove as soon as everything is ready 5313 // to replace VGPR to SGPR copy with V_READFIRSTLANEs. 5314 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO 5315 // can only be selected from the uniform SDNode. 5316 case AMDGPU::S_ADD_CO_PSEUDO: 5317 case AMDGPU::S_SUB_CO_PSEUDO: { 5318 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 5319 ? AMDGPU::V_ADDC_U32_e64 5320 : AMDGPU::V_SUBB_U32_e64; 5321 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5322 5323 Register CarryInReg = Inst.getOperand(4).getReg(); 5324 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { 5325 Register NewCarryReg = MRI.createVirtualRegister(CarryRC); 5326 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) 5327 .addReg(CarryInReg); 5328 } 5329 5330 Register CarryOutReg = Inst.getOperand(1).getReg(); 5331 5332 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( 5333 MRI.getRegClass(Inst.getOperand(0).getReg()))); 5334 MachineInstr *CarryOp = 5335 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) 5336 .addReg(CarryOutReg, RegState::Define) 5337 .add(Inst.getOperand(2)) 5338 .add(Inst.getOperand(3)) 5339 .addReg(CarryInReg) 5340 .addImm(0); 5341 legalizeOperands(*CarryOp); 5342 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); 5343 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); 5344 Inst.eraseFromParent(); 5345 } 5346 continue; 5347 case AMDGPU::S_UADDO_PSEUDO: 5348 case AMDGPU::S_USUBO_PSEUDO: { 5349 const DebugLoc &DL = Inst.getDebugLoc(); 5350 MachineOperand &Dest0 = Inst.getOperand(0); 5351 MachineOperand &Dest1 = Inst.getOperand(1); 5352 MachineOperand &Src0 = Inst.getOperand(2); 5353 
MachineOperand &Src1 = Inst.getOperand(3); 5354 5355 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 5356 ? AMDGPU::V_ADD_I32_e64 5357 : AMDGPU::V_SUB_I32_e64; 5358 const TargetRegisterClass *NewRC = 5359 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); 5360 Register DestReg = MRI.createVirtualRegister(NewRC); 5361 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) 5362 .addReg(Dest1.getReg(), RegState::Define) 5363 .add(Src0) 5364 .add(Src1) 5365 .addImm(0); // clamp bit 5366 5367 legalizeOperands(*NewInstr, MDT); 5368 5369 MRI.replaceRegWith(Dest0.getReg(), DestReg); 5370 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, 5371 Worklist); 5372 Inst.eraseFromParent(); 5373 } 5374 continue; 5375 } 5376 5377 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 5378 // We cannot move this instruction to the VALU, so we should try to 5379 // legalize its operands instead. 5380 legalizeOperands(Inst, MDT); 5381 continue; 5382 } 5383 5384 // Use the new VALU Opcode. 5385 const MCInstrDesc &NewDesc = get(NewOpcode); 5386 Inst.setDesc(NewDesc); 5387 5388 // Remove any references to SCC. Vector instructions can't read from it, and 5389 // We're just about to add the implicit use / defs of VCC, and we don't want 5390 // both. 5391 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 5392 MachineOperand &Op = Inst.getOperand(i); 5393 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 5394 // Only propagate through live-def of SCC. 5395 if (Op.isDef() && !Op.isDead()) 5396 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); 5397 Inst.RemoveOperand(i); 5398 } 5399 } 5400 5401 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 5402 // We are converting these to a BFE, so we need to add the missing 5403 // operands for the size and offset. 5404 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; 5405 Inst.addOperand(MachineOperand::CreateImm(0)); 5406 Inst.addOperand(MachineOperand::CreateImm(Size)); 5407 5408 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 5409 // The VALU version adds the second operand to the result, so insert an 5410 // extra 0 operand. 5411 Inst.addOperand(MachineOperand::CreateImm(0)); 5412 } 5413 5414 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 5415 fixImplicitOperands(Inst); 5416 5417 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 5418 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 5419 // If we need to move this to VGPRs, we need to unpack the second operand 5420 // back into the 2 separate ones for bit offset and width. 5421 assert(OffsetWidthOp.isImm() && 5422 "Scalar BFE is only implemented for constant width and offset"); 5423 uint32_t Imm = OffsetWidthOp.getImm(); 5424 5425 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 5426 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 5427 Inst.RemoveOperand(2); // Remove old immediate. 5428 Inst.addOperand(MachineOperand::CreateImm(Offset)); 5429 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 5430 } 5431 5432 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 5433 unsigned NewDstReg = AMDGPU::NoRegister; 5434 if (HasDst) { 5435 Register DstReg = Inst.getOperand(0).getReg(); 5436 if (Register::isPhysicalRegister(DstReg)) 5437 continue; 5438 5439 // Update the destination register class. 5440 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 5441 if (!NewDstRC) 5442 continue; 5443 5444 if (Inst.isCopy() && 5445 Register::isVirtualRegister(Inst.getOperand(1).getReg()) && 5446 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 5447 // Instead of creating a copy where src and dst are the same register 5448 // class, we just replace all uses of dst with src. 
These kinds of 5449 // copies interfere with the heuristics MachineSink uses to decide 5450 // whether or not to split a critical edge. Since the pass assumes 5451 // that copies will end up as machine instructions and not be 5452 // eliminated. 5453 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 5454 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 5455 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 5456 Inst.getOperand(0).setReg(DstReg); 5457 5458 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally 5459 // these are deleted later, but at -O0 it would leave a suspicious 5460 // looking illegal copy of an undef register. 5461 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 5462 Inst.RemoveOperand(I); 5463 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 5464 continue; 5465 } 5466 5467 NewDstReg = MRI.createVirtualRegister(NewDstRC); 5468 MRI.replaceRegWith(DstReg, NewDstReg); 5469 } 5470 5471 // Legalize the operands 5472 legalizeOperands(Inst, MDT); 5473 5474 if (HasDst) 5475 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 5476 } 5477 } 5478 5479 // Add/sub require special handling to deal with carry outs. 5480 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, 5481 MachineDominatorTree *MDT) const { 5482 if (ST.hasAddNoCarry()) { 5483 // Assume there is no user of scc since we don't select this in that case. 5484 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 5485 // is used. 5486 5487 MachineBasicBlock &MBB = *Inst.getParent(); 5488 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5489 5490 Register OldDstReg = Inst.getOperand(0).getReg(); 5491 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5492 5493 unsigned Opc = Inst.getOpcode(); 5494 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 5495 5496 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 
5497 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 5498 5499 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 5500 Inst.RemoveOperand(3); 5501 5502 Inst.setDesc(get(NewOpc)); 5503 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit 5504 Inst.addImplicitDefUseOperands(*MBB.getParent()); 5505 MRI.replaceRegWith(OldDstReg, ResultReg); 5506 legalizeOperands(Inst, MDT); 5507 5508 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5509 return true; 5510 } 5511 5512 return false; 5513 } 5514 5515 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, 5516 MachineInstr &Inst) const { 5517 MachineBasicBlock &MBB = *Inst.getParent(); 5518 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5519 MachineBasicBlock::iterator MII = Inst; 5520 DebugLoc DL = Inst.getDebugLoc(); 5521 5522 MachineOperand &Dest = Inst.getOperand(0); 5523 MachineOperand &Src = Inst.getOperand(1); 5524 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5525 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5526 5527 unsigned SubOp = ST.hasAddNoCarry() ? 
AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;

  // abs(x) = max(x, 0 - x).
  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
    .addImm(0)
    .addReg(Src.getReg());

  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
    .addReg(Src.getReg())
    .addReg(TmpReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

// Lower a scalar XNOR either directly to V_XNOR_B32 (targets with DL
// instructions) or to an equivalent NOT/XOR pair that is re-queued on the
// worklist.
void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
      .add(Src0)
      .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .addReg(Temp)
                .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .add(Src0)
                .addReg(Temp);
    } else {
      // Neither source is known to be scalar: XOR first, then invert the
      // result; the NOT also needs lowering, so queue it.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
                .add(Src0)
                .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}

/// Expand a scalar "not(binop)" instruction:
///   Interm  = Opcode(Src0, Src1)
///   NewDest = S_NOT_B32(Interm)
/// Both new instructions are queued for further VALU lowering.
void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
                                      MachineInstr &Inst,
                                      unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
    .add(Src0)
    .add(Src1);

  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
    .addReg(Interm);

  Worklist.insert(&Op);
  Worklist.insert(&Not);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

/// Expand a scalar "binop(x, not(y))" instruction:
///   Interm  = S_NOT_B32(Src1)
///   NewDest = Opcode(Src0, Interm)
/// Both new instructions are queued for further VALU lowering.
void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
                                     MachineInstr &Inst,
                                     unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
    .add(Src1);

  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
    .add(Src0)
    .addReg(Interm);

  Worklist.insert(&Not);
  Worklist.insert(&Op);

  MRI.replaceRegWith(Dest.getReg(), NewDest);
  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}

/// Split a 64-bit scalar unary operation into two 32-bit VALU operations on
/// the sub0/sub1 halves, recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
MRI.getRegClass(Src0.getReg()) :
                                     &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  // High half.
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  // Recombine the halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Split a 64-bit scalar add/sub pseudo into a 32-bit add/sub that produces
/// a carry plus a 32-bit add/sub-with-carry, recombined with REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
                                         MachineInstr &Inst,
                                         MachineDominatorTree *MDT) const {
  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register CarryReg = MRI.createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);


  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  unsigned LoOpc = IsAdd ?
AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
  // Low half: produces the carry consumed by the high half.
  MachineInstr *LoHalf =
    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    .addReg(CarryReg, RegState::Define)
    .add(SrcReg0Sub0)
    .add(SrcReg1Sub0)
    .addImm(0); // clamp bit

  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  MachineInstr *HiHalf =
    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .add(SrcReg0Sub1)
    .add(SrcReg1Sub1)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Split a 64-bit scalar binary operation into two independent 32-bit
/// operations on the sub0/sub1 halves, recombined with REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
                                      MRI.getRegClass(Src0.getReg()) :
                                      &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
MRI.getRegClass(Src1.getReg()) :
                                      &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  // Low half.
  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  // High half.
  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  // Recombine the halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}

/// Expand a 64-bit scalar XNOR into S_NOT_B64 of one operand followed by
/// S_XOR_B64, preferring to negate an SGPR operand.
void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
                                       MachineInstr &Inst,
                                       MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());

  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  MachineOperand* Op0;
  MachineOperand* Op1;

  // Negate the operand known to be an SGPR (if any) so the NOT can stay on
  // the scalar unit.
  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
    Op0 = &Src0;
    Op1 = &Src1;
  } else {
    Op0 = &Src1;
    Op1 = &Src0;
  }

  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
    .add(*Op0);

  Register NewDest = MRI.createVirtualRegister(DestRC);

  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
    .addReg(Interm)
    .add(*Op1);

  MRI.replaceRegWith(Dest.getReg(), NewDest);

  // Only the XOR needs further lowering.
  Worklist.insert(&Xor);
}

/// Split a 64-bit scalar bit count: BCNT of the low half, then BCNT of the
/// high half accumulated onto the first result.
void SIInstrInfo::splitScalar64BitBCNT(
    SetVectorType &Worklist, MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  const TargetRegisterClass *SrcRC = Src.isReg() ?
MRI.getRegClass(Src.getReg()) :
                                    &AMDGPU::SGPR_32RegClass;

  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = bcnt(lo) + 0; ResultReg = bcnt(hi) + MidReg.
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Lower S_BFE_I64 (64-bit signed bitfield extract) to VALU code. Only the
/// sign-extend-in-register form (offset 0, width <= 32) is handled.
void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // Sign-extract the field from the low half, then replicate its sign bit
    // into the high half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
      .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
      .addImm(0)
      .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; the high half becomes its sign.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Add every instruction that uses \p DstReg in a non-vector operand
/// position to the VALU-lowering worklist.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  Register DstReg,
  MachineRegisterInfo &MRI,
  SetVectorType &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();

    unsigned OpNo = 0;

    switch (UseMI.getOpcode()) {
case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::WWM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
    case AMDGPU::INSERT_SUBREG:
      break;
    default:
      OpNo = I.getOperandNo();
      break;
    }

    if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
      Worklist.insert(&UseMI);

      // Skip the remaining uses belonging to this same instruction.
      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}

/// Lower an S_PACK_* instruction to an equivalent VALU bit-manipulation
/// sequence producing a 32-bit packed result.
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}

/// Queue for VALU lowering every instruction that reads the SCC def \p Op of
/// \p SCCDefInst, redirecting SCC-to-register copies consumed by the
/// S_ADD/S_SUB carry pseudos to VCC and deleting those copies.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SetVectorType &Worklist) const {
  // Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
      if (MI.isCopy()) {
        MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
        unsigned DestReg = MI.getOperand(0).getReg();
        SmallVector<MachineInstr *, 4> Users;
        for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
          if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
              (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
            Users.push_back(&User);
            Worklist.insert(&User);
          }
        }
        // Point the pseudos' carry-in (operand 4) at VCC and drop the copy.
        for (auto &U : Users)
          U->getOperand(4).setReg(RI.getVCC());
        CopyToDelete.push_back(&MI);
      } else
        Worklist.insert(&MI);
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
      break;
  }
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();
}

/// Return the VGPR (or AGPR) register class the destination of \p Inst should
/// use after being moved to the VALU, or null if no suitable class exists.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
  case AMDGPU::WQM:
  case AMDGPU::SOFT_WQM:
  case AMDGPU::WWM: {
    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
    if (RI.hasAGPRs(SrcRC)) {
      if (RI.hasAGPRs(NewDstRC))
        return nullptr;

      switch (Inst.getOpcode()) {
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE:
      case AMDGPU::INSERT_SUBREG:
        NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
        break;
      default:
        NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      }

      if (!NewDstRC)
        return nullptr;
    } else {
      // Already a vector class (or the i1 VReg_1 special case): no rewrite.
      if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
        return nullptr;

      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      if (!NewDstRC)
        return nullptr;
    }

    return NewDstRC;
  }
  default:
    return NewDstRC;
  }
}

// Find the one SGPR operand we are allowed to use.
Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.
Register SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  Register UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, Check the dynamic register class.
    Register Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.
if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

/// Return a pointer to \p MI's operand with the given named-operand index,
/// or null when the opcode has no such operand.
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

/// Return the default buffer resource descriptor data-format word for the
/// current subtarget generation.
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60); // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

/// Compute words 2-3 of the scratch buffer resource descriptor.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32.
  uint64_t IndexStride = ST.getWavefrontSize() == 64 ?
3 : 2;
  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

bool SIInstrInfo::isHighLatencyDef(int Opc) const {
  return get(Opc).mayLoad() &&
         (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}

/// If \p MI accesses the stack through a frame-index vaddr operand, set
/// \p FrameIndex and return the vdata register; otherwise return NoRegister.
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

/// Like isStackAccess, but for SGPR spills which use addr/data operands.
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

/// Sum the sizes of all instructions inside the bundle headed by \p MI.
unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }

  return Size;
}

unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  if (isFixedSize(MI))
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return DescSize; // No operands.

    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return isVOP3(MI) ?
12 : (DescSize + 4);

    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (Src2Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
      return isVOP3(MI) ? 12 : (DescSize + 4);

    return DescSize;
  }

  // Check whether we have extra NSA words.
  if (isMIMG(MI)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx < 0)
      return 8;

    int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
  }

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::EH_LABEL:
    return 0;
  case TargetOpcode::BUNDLE:
    return getInstBundleSize(MI);
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(),
                              &MF->getSubtarget());
  }
  default:
    return DescSize;
  }
}

bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  // Without memory operands, conservatively assume the flat address space may
  // be touched.
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}

bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
}

/// Rewrite a non-uniform if-region: the entry's branch pseudo becomes SI_IF,
/// and the join block gains a matching SI_END_CF.
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstr *SIIF =
      BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
      .add(Branch->getOperand(0))
      .add(Branch->getOperand(1));
    MachineInstr *SIEND =
      BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
      .addReg(DstReg);

    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}

/// Rewrite a non-uniform loop: the latch's branch pseudo is replaced by
/// SI_IF_BREAK + SI_LOOP, with a PHI in the header carrying the exit mask
/// (zero on entry edges, the break mask on the back edge).
void SIInstrInfo::convertNonUniformLoopRegion(
  MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstrBuilder HeaderPHIBuilder =
      BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        MachineBasicBlock *PMBB = *PI;
        Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
} 6517 MachineInstr *HeaderPhi = HeaderPHIBuilder; 6518 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 6519 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 6520 .addReg(DstReg) 6521 .add(Branch->getOperand(0)); 6522 MachineInstr *SILOOP = 6523 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 6524 .addReg(BackEdgeReg) 6525 .addMBB(LoopEntry); 6526 6527 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 6528 LoopEnd->erase(TI); 6529 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 6530 LoopEnd->insert(LoopEnd->end(), SILOOP); 6531 } 6532 } 6533 6534 ArrayRef<std::pair<int, const char *>> 6535 SIInstrInfo::getSerializableTargetIndices() const { 6536 static const std::pair<int, const char *> TargetIndices[] = { 6537 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 6538 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 6539 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 6540 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 6541 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 6542 return makeArrayRef(TargetIndices); 6543 } 6544 6545 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 6546 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 6547 ScheduleHazardRecognizer * 6548 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 6549 const ScheduleDAG *DAG) const { 6550 return new GCNHazardRecognizer(DAG->MF); 6551 } 6552 6553 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 6554 /// pass. 
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

/// Split \p TF into its direct target-flag value (bits inside MO_MASK) and
/// the remaining bitmask flags.
std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
}

/// Names used to (de)serialize the direct machine-operand target flags.
ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" },
    { MO_ABS32_LO, "amdgpu-abs32-lo" },
    { MO_ABS32_HI, "amdgpu-abs32-hi" },
  };

  return makeArrayRef(TargetFlags);
}

/// A non-terminator, non-COPY instruction that writes EXEC is treated as part
/// of the basic block prologue.
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

/// Return a partially built add that does not clobber a carry-out the caller
/// cares about: V_ADD_U32_e64 on subtargets that have a no-carry add,
/// otherwise V_ADD_I32_e64 with a fresh dead carry register hinted to VCC.
/// Pre-RA variant (creates a virtual register for the dead carry).
MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           Register DestReg) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
  MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
         .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

/// Post-RA variant of getAddNoCarry: the dead carry register must be
/// scavenged instead of created. May return a null builder when nothing can
/// be scavenged.
MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               const DebugLoc &DL,
                                               Register DestReg,
                                               RegScavenger &RS) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);

  // If available, prefer to use vcc.
  Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
                             ? Register(RI.getVCC())
                             : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);

  // TODO: Users need to deal with this (scavenging can fail and yield a null
  // builder).
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

/// \returns true if \p Opcode is one of the SI_KILL_*_TERMINATOR pseudos.
bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

/// Map a SI_KILL_*_PSEUDO opcode to the descriptor of its terminator form.
const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}

/// On wave32 subtargets, rewrite implicit operands referring to the 64-bit
/// VCC to VCC_LO. No-op on wave64.
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  if (!ST.isWave32())
    return;

  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}

/// \returns true if \p MI is an SMRD instruction whose sbase operand is a
/// 128-bit buffer resource.
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g.
    // s_memtime has no sbase operand.
    return false;

  // Buffer resources use a full 128-bit SGPR tuple as sbase.
  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}

/// Number of usable bits in the immediate offset field of FLAT instructions
/// for \p AddrSpace, or 0 when the subtarget has no FLAT offsets (or they are
/// unusable for plain FLAT due to the segment-offset bug).
unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
                                           bool Signed) const {
  if (!ST.hasFlatInstOffsets())
    return 0;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return 0;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
    return Signed ? 12 : 11;

  return Signed ? 13 : 12;
}

/// \returns true if \p Offset fits the FLAT immediate offset field for
/// \p AddrSpace. Must stay consistent with getNumFlatOffsetBits above.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    bool Signed) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
    return false;

  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return (Signed && isInt<12>(Offset)) ||
           (!Signed && isUInt<11>(Offset));
  }

  return (Signed && isInt<13>(Offset)) ||
         (!Signed && isUInt<12>(Offset));
}


// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5,
  GFX10 = 6,
  SDWA10 = 7
};

/// Map the subtarget generation to its default encoding family.
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  case AMDGPUSubtarget::GFX10:
    return SIEncodingFamily::GFX10;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

/// \returns true for MC opcodes that must only be produced for assembly, not
/// selected by codegen.
bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch(MCOp) {
  // These opcodes use indirect register addressing so
  // they need special handling by codegen (currently missing).
  // Therefore it is too risky to allow these opcodes
  // to be selected by dpp combiner or sdwa peepholer.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}

/// Translate a pseudo opcode to the MC opcode for this subtarget's encoding
/// family. \returns \p Opcode unchanged if it is already a native
/// instruction, or -1 if it has no encoding here.
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  SIEncodingFamily Gen = subtargetEncodingFamily(ST);

  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
      ST.getGeneration() == AMDGPUSubtarget::GFX9)
    Gen = SIEncodingFamily::GFX9;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  // SDWA instructions use their own per-generation encoding families.
  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      Gen = SIEncodingFamily::SDWA;
      break;
    case AMDGPUSubtarget::GFX9:
      Gen = SIEncodingFamily::SDWA9;
      break;
    case AMDGPUSubtarget::GFX10:
      Gen = SIEncodingFamily::SDWA10;
      break;
    }
  }

  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
  // no encoding in the given subtarget generation.
6786 if (MCOp == (uint16_t)-1) 6787 return -1; 6788 6789 if (isAsmOnlyOpcode(MCOp)) 6790 return -1; 6791 6792 return MCOp; 6793 } 6794 6795 static 6796 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { 6797 assert(RegOpnd.isReg()); 6798 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : 6799 getRegSubRegPair(RegOpnd); 6800 } 6801 6802 TargetInstrInfo::RegSubRegPair 6803 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { 6804 assert(MI.isRegSequence()); 6805 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) 6806 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { 6807 auto &RegOp = MI.getOperand(1 + 2 * I); 6808 return getRegOrUndef(RegOp); 6809 } 6810 return TargetInstrInfo::RegSubRegPair(); 6811 } 6812 6813 // Try to find the definition of reg:subreg in subreg-manipulation pseudos 6814 // Following a subreg of reg:subreg isn't supported 6815 static bool followSubRegDef(MachineInstr &MI, 6816 TargetInstrInfo::RegSubRegPair &RSR) { 6817 if (!RSR.SubReg) 6818 return false; 6819 switch (MI.getOpcode()) { 6820 default: break; 6821 case AMDGPU::REG_SEQUENCE: 6822 RSR = getRegSequenceSubReg(MI, RSR.SubReg); 6823 return true; 6824 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg 6825 case AMDGPU::INSERT_SUBREG: 6826 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) 6827 // inserted the subreg we're looking for 6828 RSR = getRegOrUndef(MI.getOperand(2)); 6829 else { // the subreg in the rest of the reg 6830 auto R1 = getRegOrUndef(MI.getOperand(1)); 6831 if (R1.SubReg) // subreg of subreg isn't supported 6832 return false; 6833 RSR.Reg = R1.Reg; 6834 } 6835 return true; 6836 } 6837 return false; 6838 } 6839 6840 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, 6841 MachineRegisterInfo &MRI) { 6842 assert(MRI.isSSA()); 6843 if (!Register::isVirtualRegister(P.Reg)) 6844 return nullptr; 6845 6846 auto RSR = P; 6847 auto *DefInst = 
      MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      // Look through plain moves of virtual registers.
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    // Chain ends here: MI is the defining instruction we were looking for.
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

/// Conservatively determine whether EXEC may be written between \p DefMI and
/// \p UseMI. Only scans within DefMI's block and gives up (returns true)
/// after a bounded number of instructions or when the use is in another
/// block.
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
                                      Register VReg,
                                      const MachineInstr &DefMI,
                                      const MachineInstr &UseMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // Don't bother searching between blocks, although it is possible this block
  // doesn't modify exec.
  if (UseMI.getParent() != DefBB)
    return true;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan at the use.
  auto E = UseMI.getIterator();
  for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }

  return false;
}

/// Like execMayBeModifiedBeforeUse, but checks every use of \p VReg. Returns
/// true ("may be modified") unless all uses are in DefMI's block, the use
/// count is small, and no EXEC write occurs before the last use is seen.
bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  const int MaxUseInstScan = 10;
  int NumUseInst = 0;

  for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
    // Don't bother searching between blocks, although it is possible this block
    // doesn't modify exec.
    if (UseInst.getParent() != DefBB)
      return true;

    if (++NumUseInst > MaxUseInstScan)
      return true;
  }

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    // Count down the uses; once all are seen without an EXEC write, we are
    // done.
    if (I->readsRegister(VReg))
      if (--NumUseInst == 0)
        return false;

    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return true;
  }
}

/// Place the copy for a PHI destination before the first non-PHI instruction
/// at the top of \p MBB that reads \p Dst; otherwise fall back to the default
/// placement at \p LastPHIIt.
MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}

/// Place the copy for a PHI source. If the insertion point is a SI_IF /
/// SI_ELSE / SI_IF_BREAK that defines \p Src, emit an S_MOV_*_term (with an
/// implicit EXEC use) right after it instead of a plain COPY.
MachineInstr *SIInstrInfo::createPHISourceCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
    const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
  if (InsPt != MBB.end() &&
      (InsPt->getOpcode() == AMDGPU::SI_IF ||
       InsPt->getOpcode() == AMDGPU::SI_ELSE ||
       InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
      InsPt->definesRegister(Src)) {
    InsPt++;
    return BuildMI(MBB, InsPt, DL,
                   get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
                                     : AMDGPU::S_MOV_B64_term),
                   Dst)
        .addReg(Src, 0, SrcSubReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);
  }
  return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
                                              Dst);
}

bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }

/// Never actually folds (always returns nullptr); only constrains the virtual
/// side of a full copy to/from $m0 so the generic folder does not try to
/// spill $m0.
MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
    VirtRegMap *VRM) const {
  // This is a bit of a hack (copied from AArch64). Consider this instruction:
  //
  // %0:sreg_32 = COPY $m0
  //
  // We explicitly chose SReg_32 for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill $m0 normally (it would require copying to
  // a numbered SGPR anyway), and since it is in the SReg_32 register class,
  // TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
      return nullptr;
    }

    if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
      return nullptr;
    }
  }

  return nullptr;
}

/// Latency of \p MI; for a bundle, the maximum member latency plus one cycle
/// per additional bundled instruction.
unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                      const MachineInstr &MI,
                                      unsigned *PredCost) const {
  if (MI.isBundle()) {
    MachineBasicBlock::const_instr_iterator I(MI.getIterator());
    MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
    unsigned Lat = 0, Count = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      ++Count;
      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
    }
    // NOTE(review): if a BUNDLE had no bundled successors (Count == 0) this
    // would wrap to a huge unsigned value — presumably bundles always contain
    // at least one instruction; verify.
    return Lat + Count - 1;
  }

  return SchedModel.computeInstrLatency(&MI);
}