1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI Implementation of TargetInstrInfo. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIInstrInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUSubtarget.h" 17 #include "GCNHazardRecognizer.h" 18 #include "SIDefines.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "SIRegisterInfo.h" 21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 22 #include "Utils/AMDGPUBaseInfo.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ArrayRef.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/StringRef.h" 27 #include "llvm/ADT/iterator_range.h" 28 #include "llvm/Analysis/AliasAnalysis.h" 29 #include "llvm/Analysis/MemoryLocation.h" 30 #include "llvm/Analysis/ValueTracking.h" 31 #include "llvm/CodeGen/MachineBasicBlock.h" 32 #include "llvm/CodeGen/MachineDominators.h" 33 #include "llvm/CodeGen/MachineFrameInfo.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineInstr.h" 36 #include "llvm/CodeGen/MachineInstrBuilder.h" 37 #include "llvm/CodeGen/MachineInstrBundle.h" 38 #include "llvm/CodeGen/MachineMemOperand.h" 39 #include "llvm/CodeGen/MachineOperand.h" 40 #include "llvm/CodeGen/MachineRegisterInfo.h" 41 #include "llvm/CodeGen/RegisterScavenging.h" 42 #include "llvm/CodeGen/ScheduleDAG.h" 43 #include "llvm/CodeGen/SelectionDAGNodes.h" 44 #include "llvm/CodeGen/TargetOpcodes.h" 45 #include "llvm/CodeGen/TargetRegisterInfo.h" 46 #include "llvm/IR/DebugLoc.h" 47 #include "llvm/IR/DiagnosticInfo.h" 48 #include "llvm/IR/Function.h" 49 #include "llvm/IR/InlineAsm.h" 50 #include "llvm/IR/LLVMContext.h" 51 #include "llvm/MC/MCInstrDesc.h" 52 #include "llvm/Support/Casting.h" 53 #include "llvm/Support/CommandLine.h" 54 #include "llvm/Support/Compiler.h" 55 #include "llvm/Support/ErrorHandling.h" 56 #include "llvm/Support/MachineValueType.h" 57 #include "llvm/Support/MathExtras.h" 58 #include "llvm/Target/TargetMachine.h" 59 #include <cassert> 60 #include <cstdint> 61 #include <iterator> 62 #include <utility> 63 64 using namespace llvm; 65 66 #define GET_INSTRINFO_CTOR_DTOR 67 #include "AMDGPUGenInstrInfo.inc" 68 69 namespace llvm { 70 namespace AMDGPU { 71 #define GET_D16ImageDimIntrinsics_IMPL 72 #define GET_ImageDimIntrinsicTable_IMPL 73 #define GET_RsrcIntrinsics_IMPL 74 #include "AMDGPUGenSearchableTables.inc" 75 } 76 } 77 78 79 // Must be at least 4 to be able to branch over minimum unconditional branch 80 // code. This is only for making it possible to write reasonably small tests for 81 // long branches. 
82 static cl::opt<unsigned> 83 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), 84 cl::desc("Restrict range of branch instructions (DEBUG)")); 85 86 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) 87 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), 88 RI(ST), ST(ST) {} 89 90 //===----------------------------------------------------------------------===// 91 // TargetInstrInfo callbacks 92 //===----------------------------------------------------------------------===// 93 94 static unsigned getNumOperandsNoGlue(SDNode *Node) { 95 unsigned N = Node->getNumOperands(); 96 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 97 --N; 98 return N; 99 } 100 101 /// Returns true if both nodes have the same value for the given 102 /// operand \p Op, or if both nodes do not have this operand. 103 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 104 unsigned Opc0 = N0->getMachineOpcode(); 105 unsigned Opc1 = N1->getMachineOpcode(); 106 107 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 108 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 109 110 if (Op0Idx == -1 && Op1Idx == -1) 111 return true; 112 113 114 if ((Op0Idx == -1 && Op1Idx != -1) || 115 (Op1Idx == -1 && Op0Idx != -1)) 116 return false; 117 118 // getNamedOperandIdx returns the index for the MachineInstr's operands, 119 // which includes the result as the first operand. We are indexing into the 120 // MachineSDNode's operands, so we need to skip the result operand to get 121 // the real index. 122 --Op0Idx; 123 --Op1Idx; 124 125 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); 126 } 127 128 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, 129 AliasAnalysis *AA) const { 130 // TODO: The generic check fails for VALU instructions that should be 131 // rematerializable due to implicit reads of exec. We really want all of the 132 // generic logic for this except for this. 133 switch (MI.getOpcode()) { 134 case AMDGPU::V_MOV_B32_e32: 135 case AMDGPU::V_MOV_B32_e64: 136 case AMDGPU::V_MOV_B64_PSEUDO: 137 // No implicit operands. 138 return MI.getNumOperands() == MI.getDesc().getNumOperands(); 139 default: 140 return false; 141 } 142 } 143 144 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, 145 int64_t &Offset0, 146 int64_t &Offset1) const { 147 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) 148 return false; 149 150 unsigned Opc0 = Load0->getMachineOpcode(); 151 unsigned Opc1 = Load1->getMachineOpcode(); 152 153 // Make sure both are actually loads. 154 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) 155 return false; 156 157 if (isDS(Opc0) && isDS(Opc1)) { 158 159 // FIXME: Handle this case: 160 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) 161 return false; 162 163 // Check base reg. 164 if (Load0->getOperand(0) != Load1->getOperand(0)) 165 return false; 166 167 // Skip read2 / write2 variants for simplicity. 168 // TODO: We should report true if the used offsets are adjacent (excluded 169 // st64 versions). 170 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || 171 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) 172 return false; 173 174 Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue(); 175 Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue(); 176 return true; 177 } 178 179 if (isSMRD(Opc0) && isSMRD(Opc1)) { 180 // Skip time and cache invalidation instructions. 
181 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || 182 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) 183 return false; 184 185 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); 186 187 // Check base reg. 188 if (Load0->getOperand(0) != Load1->getOperand(0)) 189 return false; 190 191 const ConstantSDNode *Load0Offset = 192 dyn_cast<ConstantSDNode>(Load0->getOperand(1)); 193 const ConstantSDNode *Load1Offset = 194 dyn_cast<ConstantSDNode>(Load1->getOperand(1)); 195 196 if (!Load0Offset || !Load1Offset) 197 return false; 198 199 Offset0 = Load0Offset->getZExtValue(); 200 Offset1 = Load1Offset->getZExtValue(); 201 return true; 202 } 203 204 // MUBUF and MTBUF can access the same addresses. 205 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 206 207 // MUBUF and MTBUF have vaddr at different indices. 208 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 209 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 210 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 211 return false; 212 213 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 214 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 215 216 if (OffIdx0 == -1 || OffIdx1 == -1) 217 return false; 218 219 // getNamedOperandIdx returns the index for MachineInstrs. Since they 220 // include the output in the operand list, but SDNodes don't, we need to 221 // subtract the index by one. 222 --OffIdx0; 223 --OffIdx1; 224 225 SDValue Off0 = Load0->getOperand(OffIdx0); 226 SDValue Off1 = Load1->getOperand(OffIdx1); 227 228 // The offset might be a FrameIndexSDNode. 229 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 230 return false; 231 232 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 233 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 234 return true; 235 } 236 237 return false; 238 } 239 240 static bool isStride64(unsigned Opc) { 241 switch (Opc) { 242 case AMDGPU::DS_READ2ST64_B32: 243 case AMDGPU::DS_READ2ST64_B64: 244 case AMDGPU::DS_WRITE2ST64_B32: 245 case AMDGPU::DS_WRITE2ST64_B64: 246 return true; 247 default: 248 return false; 249 } 250 } 251 252 bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, 253 MachineOperand *&BaseOp, 254 int64_t &Offset, 255 const TargetRegisterInfo *TRI) const { 256 unsigned Opc = LdSt.getOpcode(); 257 258 if (isDS(LdSt)) { 259 const MachineOperand *OffsetImm = 260 getNamedOperand(LdSt, AMDGPU::OpName::offset); 261 if (OffsetImm) { 262 // Normal, single offset LDS instruction. 263 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); 264 // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to 265 // report that here? 266 if (!BaseOp) 267 return false; 268 269 Offset = OffsetImm->getImm(); 270 assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " 271 "operands of type register."); 272 return true; 273 } 274 275 // The 2 offset instructions use offset0 and offset1 instead. We can treat 276 // these as a load with a single offset if the 2 offsets are consecutive. We 277 // will use this for some partially aligned loads. 
278 const MachineOperand *Offset0Imm = 279 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 280 const MachineOperand *Offset1Imm = 281 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 282 283 uint8_t Offset0 = Offset0Imm->getImm(); 284 uint8_t Offset1 = Offset1Imm->getImm(); 285 286 if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { 287 // Each of these offsets is in element sized units, so we need to convert 288 // to bytes of the individual reads. 289 290 unsigned EltSize; 291 if (LdSt.mayLoad()) 292 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 293 else { 294 assert(LdSt.mayStore()); 295 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 296 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 297 } 298 299 if (isStride64(Opc)) 300 EltSize *= 64; 301 302 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); 303 Offset = EltSize * Offset0; 304 assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " 305 "operands of type register."); 306 return true; 307 } 308 309 return false; 310 } 311 312 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 313 const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); 314 if (SOffset && SOffset->isReg()) 315 return false; 316 317 MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 318 if (!AddrReg) 319 return false; 320 321 const MachineOperand *OffsetImm = 322 getNamedOperand(LdSt, AMDGPU::OpName::offset); 323 BaseOp = AddrReg; 324 Offset = OffsetImm->getImm(); 325 326 if (SOffset) // soffset can be an inline immediate. 327 Offset += SOffset->getImm(); 328 329 assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " 330 "operands of type register."); 331 return true; 332 } 333 334 if (isSMRD(LdSt)) { 335 const MachineOperand *OffsetImm = 336 getNamedOperand(LdSt, AMDGPU::OpName::offset); 337 if (!OffsetImm) 338 return false; 339 340 MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); 341 BaseOp = SBaseReg; 342 Offset = OffsetImm->getImm(); 343 assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " 344 "operands of type register."); 345 return true; 346 } 347 348 if (isFLAT(LdSt)) { 349 MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 350 if (VAddr) { 351 // Can't analyze 2 offsets. 352 if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) 353 return false; 354 355 BaseOp = VAddr; 356 } else { 357 // scratch instructions have either vaddr or saddr. 358 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); 359 } 360 361 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); 362 assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " 363 "operands of type register."); 364 return true; 365 } 366 367 return false; 368 } 369 370 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, 371 const MachineOperand &BaseOp1, 372 const MachineInstr &MI2, 373 const MachineOperand &BaseOp2) { 374 // Support only base operands with base registers. 375 // Note: this could be extended to support FI operands. 
376 if (!BaseOp1.isReg() || !BaseOp2.isReg()) 377 return false; 378 379 if (BaseOp1.isIdenticalTo(BaseOp2)) 380 return true; 381 382 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) 383 return false; 384 385 auto MO1 = *MI1.memoperands_begin(); 386 auto MO2 = *MI2.memoperands_begin(); 387 if (MO1->getAddrSpace() != MO2->getAddrSpace()) 388 return false; 389 390 auto Base1 = MO1->getValue(); 391 auto Base2 = MO2->getValue(); 392 if (!Base1 || !Base2) 393 return false; 394 const MachineFunction &MF = *MI1.getParent()->getParent(); 395 const DataLayout &DL = MF.getFunction().getParent()->getDataLayout(); 396 Base1 = GetUnderlyingObject(Base1, DL); 397 Base2 = GetUnderlyingObject(Base1, DL); 398 399 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) 400 return false; 401 402 return Base1 == Base2; 403 } 404 405 bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, 406 MachineOperand &BaseOp2, 407 unsigned NumLoads) const { 408 MachineInstr &FirstLdSt = *BaseOp1.getParent(); 409 MachineInstr &SecondLdSt = *BaseOp2.getParent(); 410 411 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) 412 return false; 413 414 const MachineOperand *FirstDst = nullptr; 415 const MachineOperand *SecondDst = nullptr; 416 417 if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || 418 (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || 419 (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { 420 const unsigned MaxGlobalLoadCluster = 6; 421 if (NumLoads > MaxGlobalLoadCluster) 422 return false; 423 424 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); 425 if (!FirstDst) 426 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 427 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); 428 if (!SecondDst) 429 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 430 } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { 431 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); 432 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); 433 } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { 434 FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); 435 SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); 436 } 437 438 if (!FirstDst || !SecondDst) 439 return false; 440 441 // Try to limit clustering based on the total number of bytes loaded 442 // rather than the number of instructions. This is done to help reduce 443 // register pressure. The method used is somewhat inexact, though, 444 // because it assumes that all loads in the cluster will load the 445 // same number of bytes as FirstLdSt. 446 447 // The unit of this value is bytes. 448 // FIXME: This needs finer tuning. 449 unsigned LoadClusterThreshold = 16; 450 451 const MachineRegisterInfo &MRI = 452 FirstLdSt.getParent()->getParent()->getRegInfo(); 453 const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); 454 455 return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; 456 } 457 458 // FIXME: This behaves strangely. If, for example, you have 32 load + stores, 459 // the first 16 loads will be interleaved with the stores, and the next 16 will 460 // be clustered as expected. It should really split into 2 16 store batches. 461 // 462 // Loads are clustered until this returns false, rather than trying to schedule 463 // groups of stores. This also means we have to deal with saying different 464 // address space loads should be clustered, and ones which might cause bank 465 // conflicts. 
466 // 467 // This might be deprecated so it might not be worth that much effort to fix. 468 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, 469 int64_t Offset0, int64_t Offset1, 470 unsigned NumLoads) const { 471 assert(Offset1 > Offset0 && 472 "Second offset should be larger than first offset!"); 473 // If we have less than 16 loads in a row, and the offsets are within 64 474 // bytes, then schedule together. 475 476 // A cacheline is 64 bytes (for global memory). 477 return (NumLoads <= 16 && (Offset1 - Offset0) < 64); 478 } 479 480 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 481 MachineBasicBlock::iterator MI, 482 const DebugLoc &DL, unsigned DestReg, 483 unsigned SrcReg, bool KillSrc) { 484 MachineFunction *MF = MBB.getParent(); 485 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), 486 "illegal SGPR to VGPR copy", 487 DL, DS_Error); 488 LLVMContext &C = MF->getFunction().getContext(); 489 C.diagnose(IllegalCopy); 490 491 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 492 .addReg(SrcReg, getKillRegState(KillSrc)); 493 } 494 495 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 496 MachineBasicBlock::iterator MI, 497 const DebugLoc &DL, unsigned DestReg, 498 unsigned SrcReg, bool KillSrc) const { 499 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 500 501 if (RC == &AMDGPU::VGPR_32RegClass) { 502 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 503 AMDGPU::SReg_32RegClass.contains(SrcReg)); 504 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 505 .addReg(SrcReg, getKillRegState(KillSrc)); 506 return; 507 } 508 509 if (RC == &AMDGPU::SReg_32_XM0RegClass || 510 RC == &AMDGPU::SReg_32RegClass) { 511 if (SrcReg == AMDGPU::SCC) { 512 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 513 .addImm(-1) 514 .addImm(0); 515 return; 516 } 517 518 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 519 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 520 return; 521 } 522 523 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 524 .addReg(SrcReg, getKillRegState(KillSrc)); 525 return; 526 } 527 528 if (RC == &AMDGPU::SReg_64RegClass) { 529 if (DestReg == AMDGPU::VCC) { 530 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 531 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 532 .addReg(SrcReg, getKillRegState(KillSrc)); 533 } else { 534 // FIXME: Hack until VReg_1 removed. 
535 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 536 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 537 .addImm(0) 538 .addReg(SrcReg, getKillRegState(KillSrc)); 539 } 540 541 return; 542 } 543 544 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 545 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 546 return; 547 } 548 549 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 550 .addReg(SrcReg, getKillRegState(KillSrc)); 551 return; 552 } 553 554 if (DestReg == AMDGPU::SCC) { 555 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 556 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 557 .addReg(SrcReg, getKillRegState(KillSrc)) 558 .addImm(0); 559 return; 560 } 561 562 unsigned EltSize = 4; 563 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 564 if (RI.isSGPRClass(RC)) { 565 if (RI.getRegSizeInBits(*RC) > 32) { 566 Opcode = AMDGPU::S_MOV_B64; 567 EltSize = 8; 568 } else { 569 Opcode = AMDGPU::S_MOV_B32; 570 EltSize = 4; 571 } 572 573 if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { 574 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 575 return; 576 } 577 } 578 579 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 580 bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 581 582 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 583 unsigned SubIdx; 584 if (Forward) 585 SubIdx = SubIndices[Idx]; 586 else 587 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 588 589 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 590 get(Opcode), RI.getSubReg(DestReg, SubIdx)); 591 592 Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); 593 594 if (Idx == 0) 595 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 596 597 bool UseKill = KillSrc && Idx == SubIndices.size() - 1; 598 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 599 } 600 } 601 602 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 603 int NewOpc; 604 605 // Try to map original to commuted opcode 606 NewOpc = AMDGPU::getCommuteRev(Opcode); 607 if (NewOpc != -1) 608 // Check if the commuted (REV) opcode exists on the target. 609 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 610 611 // Try to map commuted to original opcode 612 NewOpc = AMDGPU::getCommuteOrig(Opcode); 613 if (NewOpc != -1) 614 // Check if the original (non-REV) opcode exists on the target. 615 return pseudoToMCOpcode(NewOpc) != -1 ? 
NewOpc : -1; 616 617 return Opcode; 618 } 619 620 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 621 MachineBasicBlock::iterator MI, 622 const DebugLoc &DL, unsigned DestReg, 623 int64_t Value) const { 624 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 625 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 626 if (RegClass == &AMDGPU::SReg_32RegClass || 627 RegClass == &AMDGPU::SGPR_32RegClass || 628 RegClass == &AMDGPU::SReg_32_XM0RegClass || 629 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 630 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 631 .addImm(Value); 632 return; 633 } 634 635 if (RegClass == &AMDGPU::SReg_64RegClass || 636 RegClass == &AMDGPU::SGPR_64RegClass || 637 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 638 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 639 .addImm(Value); 640 return; 641 } 642 643 if (RegClass == &AMDGPU::VGPR_32RegClass) { 644 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 645 .addImm(Value); 646 return; 647 } 648 if (RegClass == &AMDGPU::VReg_64RegClass) { 649 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 650 .addImm(Value); 651 return; 652 } 653 654 unsigned EltSize = 4; 655 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 656 if (RI.isSGPRClass(RegClass)) { 657 if (RI.getRegSizeInBits(*RegClass) > 32) { 658 Opcode = AMDGPU::S_MOV_B64; 659 EltSize = 8; 660 } else { 661 Opcode = AMDGPU::S_MOV_B32; 662 EltSize = 4; 663 } 664 } 665 666 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 667 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 668 int64_t IdxValue = Idx == 0 ? Value : 0; 669 670 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 671 get(Opcode), RI.getSubReg(DestReg, Idx)); 672 Builder.addImm(IdxValue); 673 } 674 } 675 676 const TargetRegisterClass * 677 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 678 return &AMDGPU::VGPR_32RegClass; 679 } 680 681 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 682 MachineBasicBlock::iterator I, 683 const DebugLoc &DL, unsigned DstReg, 684 ArrayRef<MachineOperand> Cond, 685 unsigned TrueReg, 686 unsigned FalseReg) const { 687 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 688 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 689 "Not a VGPR32 reg"); 690 691 if (Cond.size() == 1) { 692 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 693 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 694 .add(Cond[0]); 695 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 696 .addReg(FalseReg) 697 .addReg(TrueReg) 698 .addReg(SReg); 699 } else if (Cond.size() == 2) { 700 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 701 switch (Cond[0].getImm()) { 702 case SIInstrInfo::SCC_TRUE: { 703 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 704 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 705 .addImm(-1) 706 .addImm(0); 707 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 708 .addReg(FalseReg) 709 .addReg(TrueReg) 710 .addReg(SReg); 711 break; 712 } 713 case SIInstrInfo::SCC_FALSE: { 714 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 715 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 716 .addImm(0) 717 .addImm(-1); 718 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 719 .addReg(FalseReg) 720 .addReg(TrueReg) 721 .addReg(SReg); 722 break; 723 } 724 case SIInstrInfo::VCCNZ: { 725 MachineOperand RegOp = Cond[1]; 726 RegOp.setImplicit(false); 
727 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 728 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 729 .add(RegOp); 730 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 731 .addReg(FalseReg) 732 .addReg(TrueReg) 733 .addReg(SReg); 734 break; 735 } 736 case SIInstrInfo::VCCZ: { 737 MachineOperand RegOp = Cond[1]; 738 RegOp.setImplicit(false); 739 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 740 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 741 .add(RegOp); 742 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 743 .addReg(TrueReg) 744 .addReg(FalseReg) 745 .addReg(SReg); 746 break; 747 } 748 case SIInstrInfo::EXECNZ: { 749 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 750 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 751 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 752 .addImm(0); 753 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 754 .addImm(-1) 755 .addImm(0); 756 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 757 .addReg(FalseReg) 758 .addReg(TrueReg) 759 .addReg(SReg); 760 break; 761 } 762 case SIInstrInfo::EXECZ: { 763 unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 764 unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 765 BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 766 .addImm(0); 767 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) 768 .addImm(0) 769 .addImm(-1); 770 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 771 .addReg(FalseReg) 772 .addReg(TrueReg) 773 .addReg(SReg); 774 llvm_unreachable("Unhandled branch predicate EXECZ"); 775 break; 776 } 777 default: 778 llvm_unreachable("invalid branch predicate"); 779 } 780 } else { 781 llvm_unreachable("Can only handle Cond size 1 or 2"); 782 } 783 } 784 785 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 786 MachineBasicBlock::iterator I, 787 const DebugLoc &DL, 788 unsigned SrcReg, int Value) const { 789 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 790 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 791 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 792 .addImm(Value) 793 .addReg(SrcReg); 794 795 return Reg; 796 } 797 798 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, 799 MachineBasicBlock::iterator I, 800 const DebugLoc &DL, 801 unsigned SrcReg, int Value) const { 802 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 803 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 804 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 805 .addImm(Value) 806 .addReg(SrcReg); 807 808 return Reg; 809 } 810 811 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 812 813 if (RI.getRegSizeInBits(*DstRC) == 32) { 814 return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 815 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 816 return AMDGPU::S_MOV_B64; 817 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 818 return AMDGPU::V_MOV_B64_PSEUDO; 819 } 820 return AMDGPU::COPY; 821 } 822 823 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 824 switch (Size) { 825 case 4: 826 return AMDGPU::SI_SPILL_S32_SAVE; 827 case 8: 828 return AMDGPU::SI_SPILL_S64_SAVE; 829 case 16: 830 return AMDGPU::SI_SPILL_S128_SAVE; 831 case 32: 832 return AMDGPU::SI_SPILL_S256_SAVE; 833 case 64: 834 return AMDGPU::SI_SPILL_S512_SAVE; 835 default: 836 llvm_unreachable("unknown register size"); 837 } 838 } 839 840 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 841 switch (Size) { 842 case 4: 843 return AMDGPU::SI_SPILL_V32_SAVE; 844 case 8: 845 return AMDGPU::SI_SPILL_V64_SAVE; 846 case 12: 847 return AMDGPU::SI_SPILL_V96_SAVE; 848 case 16: 849 return AMDGPU::SI_SPILL_V128_SAVE; 850 case 32: 851 return AMDGPU::SI_SPILL_V256_SAVE; 852 case 64: 853 return AMDGPU::SI_SPILL_V512_SAVE; 854 default: 855 llvm_unreachable("unknown register size"); 856 } 857 } 858 859 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 860 MachineBasicBlock::iterator MI, 861 unsigned SrcReg, bool isKill, 862 int FrameIndex, 863 const TargetRegisterClass *RC, 864 const TargetRegisterInfo *TRI) const { 865 MachineFunction *MF = MBB.getParent(); 866 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 867 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 868 const DebugLoc &DL = MBB.findDebugLoc(MI); 869 870 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 871 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 872 MachinePointerInfo PtrInfo 873 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 874 MachineMemOperand *MMO 875 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 876 Size, Align); 877 unsigned SpillSize = TRI->getSpillSize(*RC); 878 879 if (RI.isSGPRClass(RC)) { 880 MFI->setHasSpilledSGPRs(); 881 882 // We are only allowed to create one new instruction when spilling 883 // registers, so we need to use pseudo instruction for spilling SGPRs. 884 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); 885 886 // The SGPR spill/restore instructions only work on number sgprs, so we need 887 // to make sure we are using the correct register class. 888 if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { 889 MachineRegisterInfo &MRI = MF->getRegInfo(); 890 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); 891 } 892 893 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) 894 .addReg(SrcReg, getKillRegState(isKill)) // data 895 .addFrameIndex(FrameIndex) // addr 896 .addMemOperand(MMO) 897 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 898 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 899 // Add the scratch resource registers as implicit uses because we may end up 900 // needing them, and need to ensure that the reserved registers are 901 // correctly handled. 902 903 FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); 904 if (ST.hasScalarStores()) { 905 // m0 is used for offset to scalar stores if used to spill. 
906 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); 907 } 908 909 return; 910 } 911 912 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 913 914 unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); 915 MFI->setHasSpilledVGPRs(); 916 BuildMI(MBB, MI, DL, get(Opcode)) 917 .addReg(SrcReg, getKillRegState(isKill)) // data 918 .addFrameIndex(FrameIndex) // addr 919 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 920 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 921 .addImm(0) // offset 922 .addMemOperand(MMO); 923 } 924 925 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 926 switch (Size) { 927 case 4: 928 return AMDGPU::SI_SPILL_S32_RESTORE; 929 case 8: 930 return AMDGPU::SI_SPILL_S64_RESTORE; 931 case 16: 932 return AMDGPU::SI_SPILL_S128_RESTORE; 933 case 32: 934 return AMDGPU::SI_SPILL_S256_RESTORE; 935 case 64: 936 return AMDGPU::SI_SPILL_S512_RESTORE; 937 default: 938 llvm_unreachable("unknown register size"); 939 } 940 } 941 942 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 943 switch (Size) { 944 case 4: 945 return AMDGPU::SI_SPILL_V32_RESTORE; 946 case 8: 947 return AMDGPU::SI_SPILL_V64_RESTORE; 948 case 12: 949 return AMDGPU::SI_SPILL_V96_RESTORE; 950 case 16: 951 return AMDGPU::SI_SPILL_V128_RESTORE; 952 case 32: 953 return AMDGPU::SI_SPILL_V256_RESTORE; 954 case 64: 955 return AMDGPU::SI_SPILL_V512_RESTORE; 956 default: 957 llvm_unreachable("unknown register size"); 958 } 959 } 960 961 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 962 MachineBasicBlock::iterator MI, 963 unsigned DestReg, int FrameIndex, 964 const TargetRegisterClass *RC, 965 const TargetRegisterInfo *TRI) const { 966 MachineFunction *MF = MBB.getParent(); 967 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 968 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 969 const DebugLoc &DL = MBB.findDebugLoc(MI); 970 unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); 971 unsigned Size = FrameInfo.getObjectSize(FrameIndex); 972 unsigned SpillSize = TRI->getSpillSize(*RC); 973 974 MachinePointerInfo PtrInfo 975 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 976 977 MachineMemOperand *MMO = MF->getMachineMemOperand( 978 PtrInfo, MachineMemOperand::MOLoad, Size, Align); 979 980 if (RI.isSGPRClass(RC)) { 981 MFI->setHasSpilledSGPRs(); 982 983 // FIXME: Maybe this should not include a memoperand because it will be 984 // lowered to non-memory instructions. 985 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 986 if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { 987 MachineRegisterInfo &MRI = MF->getRegInfo(); 988 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); 989 } 990 991 FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); 992 MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) 993 .addFrameIndex(FrameIndex) // addr 994 .addMemOperand(MMO) 995 .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) 996 .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); 997 998 if (ST.hasScalarStores()) { 999 // m0 is used for offset to scalar stores if used to spill. 
1000 Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); 1001 } 1002 1003 return; 1004 } 1005 1006 assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); 1007 1008 unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); 1009 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 1010 .addFrameIndex(FrameIndex) // vaddr 1011 .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc 1012 .addReg(MFI->getFrameOffsetReg()) // scratch_offset 1013 .addImm(0) // offset 1014 .addMemOperand(MMO); 1015 } 1016 1017 /// \param @Offset Offset in bytes of the FrameIndex being spilled 1018 unsigned SIInstrInfo::calculateLDSSpillAddress( 1019 MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, 1020 unsigned FrameOffset, unsigned Size) const { 1021 MachineFunction *MF = MBB.getParent(); 1022 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1023 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 1024 const DebugLoc &DL = MBB.findDebugLoc(MI); 1025 unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); 1026 unsigned WavefrontSize = ST.getWavefrontSize(); 1027 1028 unsigned TIDReg = MFI->getTIDReg(); 1029 if (!MFI->hasCalculatedTID()) { 1030 MachineBasicBlock &Entry = MBB.getParent()->front(); 1031 MachineBasicBlock::iterator Insert = Entry.front(); 1032 const DebugLoc &DL = Insert->getDebugLoc(); 1033 1034 TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, 1035 *MF); 1036 if (TIDReg == AMDGPU::NoRegister) 1037 return TIDReg; 1038 1039 if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) && 1040 WorkGroupSize > WavefrontSize) { 1041 unsigned TIDIGXReg 1042 = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 1043 unsigned TIDIGYReg 1044 = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 1045 unsigned TIDIGZReg 1046 = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 1047 unsigned InputPtrReg = 1048 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 1049 for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { 1050 if (!Entry.isLiveIn(Reg)) 1051 Entry.addLiveIn(Reg); 1052 } 1053 1054 RS->enterBasicBlock(Entry); 1055 // FIXME: Can we scavenge an SReg_64 and access the subregs? 
1056 unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); 1057 unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); 1058 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) 1059 .addReg(InputPtrReg) 1060 .addImm(SI::KernelInputOffsets::NGROUPS_Z); 1061 BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) 1062 .addReg(InputPtrReg) 1063 .addImm(SI::KernelInputOffsets::NGROUPS_Y); 1064 1065 // NGROUPS.X * NGROUPS.Y 1066 BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) 1067 .addReg(STmp1) 1068 .addReg(STmp0); 1069 // (NGROUPS.X * NGROUPS.Y) * TIDIG.X 1070 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) 1071 .addReg(STmp1) 1072 .addReg(TIDIGXReg); 1073 // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) 1074 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) 1075 .addReg(STmp0) 1076 .addReg(TIDIGYReg) 1077 .addReg(TIDReg); 1078 // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z 1079 getAddNoCarry(Entry, Insert, DL, TIDReg) 1080 .addReg(TIDReg) 1081 .addReg(TIDIGZReg); 1082 } else { 1083 // Get the wave id 1084 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), 1085 TIDReg) 1086 .addImm(-1) 1087 .addImm(0); 1088 1089 BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), 1090 TIDReg) 1091 .addImm(-1) 1092 .addReg(TIDReg); 1093 } 1094 1095 BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), 1096 TIDReg) 1097 .addImm(2) 1098 .addReg(TIDReg); 1099 MFI->setTIDReg(TIDReg); 1100 } 1101 1102 // Add FrameIndex to LDS offset 1103 unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); 1104 getAddNoCarry(MBB, MI, DL, TmpReg) 1105 .addImm(LDSOffset) 1106 .addReg(TIDReg); 1107 1108 return TmpReg; 1109 } 1110 1111 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, 1112 MachineBasicBlock::iterator MI, 1113 int Count) const { 1114 DebugLoc DL = MBB.findDebugLoc(MI); 1115 while (Count > 0) { 1116 int Arg; 1117 if (Count >= 8) 1118 Arg = 7; 1119 else 1120 Arg = Count - 1; 1121 Count -= 8; 1122 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) 1123 .addImm(Arg); 1124 } 1125 } 1126 1127 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, 1128 MachineBasicBlock::iterator MI) const { 1129 insertWaitStates(MBB, MI, 1); 1130 } 1131 1132 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { 1133 auto MF = MBB.getParent(); 1134 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 1135 1136 assert(Info->isEntryFunction()); 1137 1138 if (MBB.succ_empty()) { 1139 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); 1140 if (HasNoTerminator) { 1141 if (Info->returnsVoid()) { 1142 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); 1143 } else { 1144 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); 1145 } 1146 } 1147 } 1148 } 1149 1150 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { 1151 switch (MI.getOpcode()) { 1152 default: return 1; // FIXME: Do wait states equal cycles? 1153 1154 case AMDGPU::S_NOP: 1155 return MI.getOperand(0).getImm() + 1; 1156 } 1157 } 1158 1159 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1160 MachineBasicBlock &MBB = *MI.getParent(); 1161 DebugLoc DL = MBB.findDebugLoc(MI); 1162 switch (MI.getOpcode()) { 1163 default: return TargetInstrInfo::expandPostRAPseudo(MI); 1164 case AMDGPU::S_MOV_B64_term: 1165 // This is only a terminator to get the correct spill code placement during 1166 // register allocation. 
1167 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1168 break; 1169 1170 case AMDGPU::S_XOR_B64_term: 1171 // This is only a terminator to get the correct spill code placement during 1172 // register allocation. 1173 MI.setDesc(get(AMDGPU::S_XOR_B64)); 1174 break; 1175 1176 case AMDGPU::S_ANDN2_B64_term: 1177 // This is only a terminator to get the correct spill code placement during 1178 // register allocation. 1179 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 1180 break; 1181 1182 case AMDGPU::V_MOV_B64_PSEUDO: { 1183 unsigned Dst = MI.getOperand(0).getReg(); 1184 unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1185 unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1186 1187 const MachineOperand &SrcOp = MI.getOperand(1); 1188 // FIXME: Will this work for 64-bit floating point immediates? 1189 assert(!SrcOp.isFPImm()); 1190 if (SrcOp.isImm()) { 1191 APInt Imm(64, SrcOp.getImm()); 1192 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1193 .addImm(Imm.getLoBits(32).getZExtValue()) 1194 .addReg(Dst, RegState::Implicit | RegState::Define); 1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1196 .addImm(Imm.getHiBits(32).getZExtValue()) 1197 .addReg(Dst, RegState::Implicit | RegState::Define); 1198 } else { 1199 assert(SrcOp.isReg()); 1200 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1201 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 1202 .addReg(Dst, RegState::Implicit | RegState::Define); 1203 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1204 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 1205 .addReg(Dst, RegState::Implicit | RegState::Define); 1206 } 1207 MI.eraseFromParent(); 1208 break; 1209 } 1210 case AMDGPU::V_SET_INACTIVE_B32: { 1211 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) 1212 .addReg(AMDGPU::EXEC); 1213 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 1214 .add(MI.getOperand(2)); 1215 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) 1216 .addReg(AMDGPU::EXEC); 1217 MI.eraseFromParent(); 1218 break; 1219 } 1220 case AMDGPU::V_SET_INACTIVE_B64: { 1221 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) 1222 .addReg(AMDGPU::EXEC); 1223 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 1224 MI.getOperand(0).getReg()) 1225 .add(MI.getOperand(2)); 1226 expandPostRAPseudo(*Copy); 1227 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) 1228 .addReg(AMDGPU::EXEC); 1229 MI.eraseFromParent(); 1230 break; 1231 } 1232 case AMDGPU::V_MOVRELD_B32_V1: 1233 case AMDGPU::V_MOVRELD_B32_V2: 1234 case AMDGPU::V_MOVRELD_B32_V4: 1235 case AMDGPU::V_MOVRELD_B32_V8: 1236 case AMDGPU::V_MOVRELD_B32_V16: { 1237 const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); 1238 unsigned VecReg = MI.getOperand(0).getReg(); 1239 bool IsUndef = MI.getOperand(1).isUndef(); 1240 unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); 1241 assert(VecReg == MI.getOperand(1).getReg()); 1242 1243 MachineInstr *MovRel = 1244 BuildMI(MBB, MI, DL, MovRelDesc) 1245 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1246 .add(MI.getOperand(2)) 1247 .addReg(VecReg, RegState::ImplicitDefine) 1248 .addReg(VecReg, 1249 RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); 1250 1251 const int ImpDefIdx = 1252 MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); 1253 const int ImpUseIdx = ImpDefIdx + 1; 1254 MovRel->tieOperands(ImpDefIdx, ImpUseIdx); 1255 1256 MI.eraseFromParent(); 1257 break; 1258 } 1259 case AMDGPU::SI_PC_ADD_REL_OFFSET: { 1260 MachineFunction &MF = *MBB.getParent(); 1261 unsigned Reg = MI.getOperand(0).getReg(); 1262 unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); 1263 unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); 1264 1265 // Create a bundle so these instructions won't be re-ordered by the 1266 // post-RA scheduler. 1267 MIBundleBuilder Bundler(MBB, MI); 1268 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); 1269 1270 // Add 32-bit offset from this instruction to the start of the 1271 // constant data. 1272 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) 1273 .addReg(RegLo) 1274 .add(MI.getOperand(1))); 1275 1276 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) 1277 .addReg(RegHi); 1278 if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) 1279 MIB.addImm(0); 1280 else 1281 MIB.add(MI.getOperand(2)); 1282 1283 Bundler.append(MIB); 1284 finalizeBundle(MBB, Bundler.begin()); 1285 1286 MI.eraseFromParent(); 1287 break; 1288 } 1289 case AMDGPU::EXIT_WWM: { 1290 // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM 1291 // is exited. 1292 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1293 break; 1294 } 1295 case TargetOpcode::BUNDLE: { 1296 if (!MI.mayLoad()) 1297 return false; 1298 1299 // If it is a load it must be a memory clause 1300 for (MachineBasicBlock::instr_iterator I = MI.getIterator(); 1301 I->isBundledWithSucc(); ++I) { 1302 I->unbundleFromSucc(); 1303 for (MachineOperand &MO : I->operands()) 1304 if (MO.isReg()) 1305 MO.setIsInternalRead(false); 1306 } 1307 1308 MI.eraseFromParent(); 1309 break; 1310 } 1311 } 1312 return true; 1313 } 1314 1315 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 1316 MachineOperand &Src0, 1317 unsigned Src0OpName, 1318 MachineOperand &Src1, 1319 unsigned Src1OpName) const { 1320 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 1321 if (!Src0Mods) 1322 return false; 1323 1324 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 1325 assert(Src1Mods && 1326 "All commutable instructions have both src0 and src1 modifiers"); 1327 1328 int Src0ModsVal = Src0Mods->getImm(); 1329 int Src1ModsVal = Src1Mods->getImm(); 1330 1331 Src1Mods->setImm(Src0ModsVal); 1332 Src0Mods->setImm(Src1ModsVal); 1333 return true; 1334 } 1335 1336 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 1337 MachineOperand &RegOp, 1338 MachineOperand &NonRegOp) { 1339 unsigned Reg = RegOp.getReg(); 1340 unsigned SubReg = RegOp.getSubReg(); 1341 bool IsKill = RegOp.isKill(); 1342 bool IsDead = RegOp.isDead(); 1343 bool IsUndef = RegOp.isUndef(); 1344 bool IsDebug = RegOp.isDebug(); 1345 1346 if (NonRegOp.isImm()) 1347 RegOp.ChangeToImmediate(NonRegOp.getImm()); 1348 else if (NonRegOp.isFI()) 1349 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 1350 else 1351 return nullptr; 1352 1353 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 1354 NonRegOp.setSubReg(SubReg); 1355 1356 return &MI; 1357 } 1358 1359 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 1360 unsigned Src0Idx, 1361 unsigned Src1Idx) const { 1362 assert(!NewMI && "this should never be used"); 1363 1364 unsigned Opc = MI.getOpcode(); 1365 int CommutedOpcode = 
commuteOpcode(Opc); 1366 if (CommutedOpcode == -1) 1367 return nullptr; 1368 1369 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 1370 static_cast<int>(Src0Idx) && 1371 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 1372 static_cast<int>(Src1Idx) && 1373 "inconsistency with findCommutedOpIndices"); 1374 1375 MachineOperand &Src0 = MI.getOperand(Src0Idx); 1376 MachineOperand &Src1 = MI.getOperand(Src1Idx); 1377 1378 MachineInstr *CommutedMI = nullptr; 1379 if (Src0.isReg() && Src1.isReg()) { 1380 if (isOperandLegal(MI, Src1Idx, &Src0)) { 1381 // Be sure to copy the source modifiers to the right place. 1382 CommutedMI 1383 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 1384 } 1385 1386 } else if (Src0.isReg() && !Src1.isReg()) { 1387 // src0 should always be able to support any operand type, so no need to 1388 // check operand legality. 1389 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 1390 } else if (!Src0.isReg() && Src1.isReg()) { 1391 if (isOperandLegal(MI, Src1Idx, &Src0)) 1392 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 1393 } else { 1394 // FIXME: Found two non registers to commute. This does happen. 1395 return nullptr; 1396 } 1397 1398 if (CommutedMI) { 1399 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 1400 Src1, AMDGPU::OpName::src1_modifiers); 1401 1402 CommutedMI->setDesc(get(CommutedOpcode)); 1403 } 1404 1405 return CommutedMI; 1406 } 1407 1408 // This needs to be implemented because the source modifiers may be inserted 1409 // between the true commutable operands, and the base 1410 // TargetInstrInfo::commuteInstruction uses it. 1411 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, 1412 unsigned &SrcOpIdx1) const { 1413 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); 1414 } 1415 1416 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, 1417 unsigned &SrcOpIdx1) const { 1418 if (!Desc.isCommutable()) 1419 return false; 1420 1421 unsigned Opc = Desc.getOpcode(); 1422 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 1423 if (Src0Idx == -1) 1424 return false; 1425 1426 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 1427 if (Src1Idx == -1) 1428 return false; 1429 1430 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 1431 } 1432 1433 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 1434 int64_t BrOffset) const { 1435 // BranchRelaxation should never have to check s_setpc_b64 because its dest 1436 // block is unanalyzable. 1437 assert(BranchOp != AMDGPU::S_SETPC_B64); 1438 1439 // Convert to dwords. 1440 BrOffset /= 4; 1441 1442 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 1443 // from the next instruction. 1444 BrOffset -= 1; 1445 1446 return isIntN(BranchOffsetBits, BrOffset); 1447 } 1448 1449 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 1450 const MachineInstr &MI) const { 1451 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 1452 // This would be a difficult analysis to perform, but can always be legal so 1453 // there's no need to analyze it. 
1454 return nullptr; 1455 } 1456 1457 return MI.getOperand(0).getMBB(); 1458 } 1459 1460 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 1461 MachineBasicBlock &DestBB, 1462 const DebugLoc &DL, 1463 int64_t BrOffset, 1464 RegScavenger *RS) const { 1465 assert(RS && "RegScavenger required for long branching"); 1466 assert(MBB.empty() && 1467 "new block should be inserted for expanding unconditional branch"); 1468 assert(MBB.pred_size() == 1); 1469 1470 MachineFunction *MF = MBB.getParent(); 1471 MachineRegisterInfo &MRI = MF->getRegInfo(); 1472 1473 // FIXME: Virtual register workaround for RegScavenger not working with empty 1474 // blocks. 1475 unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1476 1477 auto I = MBB.end(); 1478 1479 // We need to compute the offset relative to the instruction immediately after 1480 // s_getpc_b64. Insert pc arithmetic code before last terminator. 1481 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 1482 1483 // TODO: Handle > 32-bit block address. 1484 if (BrOffset >= 0) { 1485 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 1486 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1487 .addReg(PCReg, 0, AMDGPU::sub0) 1488 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); 1489 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 1490 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1491 .addReg(PCReg, 0, AMDGPU::sub1) 1492 .addImm(0); 1493 } else { 1494 // Backwards branch. 1495 BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) 1496 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 1497 .addReg(PCReg, 0, AMDGPU::sub0) 1498 .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); 1499 BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) 1500 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 1501 .addReg(PCReg, 0, AMDGPU::sub1) 1502 .addImm(0); 1503 } 1504 1505 // Insert the indirect branch after the other terminator. 1506 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 1507 .addReg(PCReg); 1508 1509 // FIXME: If spilling is necessary, this will fail because this scavenger has 1510 // no emergency stack slots. It is non-trivial to spill in this situation, 1511 // because the restore code needs to be specially placed after the 1512 // jump. BranchRelaxation then needs to be made aware of the newly inserted 1513 // block. 1514 // 1515 // If a spill is needed for the pc register pair, we need to insert a spill 1516 // restore block right before the destination block, and insert a short branch 1517 // into the old destination block's fallthrough predecessor. 1518 // e.g.: 1519 // 1520 // s_cbranch_scc0 skip_long_branch: 1521 // 1522 // long_branch_bb: 1523 // spill s[8:9] 1524 // s_getpc_b64 s[8:9] 1525 // s_add_u32 s8, s8, restore_bb 1526 // s_addc_u32 s9, s9, 0 1527 // s_setpc_b64 s[8:9] 1528 // 1529 // skip_long_branch: 1530 // foo; 1531 // 1532 // ..... 
1533 // 1534 // dest_bb_fallthrough_predecessor: 1535 // bar; 1536 // s_branch dest_bb 1537 // 1538 // restore_bb: 1539 // restore s[8:9] 1540 // fallthrough dest_bb 1541 /// 1542 // dest_bb: 1543 // buzz; 1544 1545 RS->enterBasicBlockEnd(MBB); 1546 unsigned Scav = RS->scavengeRegisterBackwards( 1547 AMDGPU::SReg_64RegClass, 1548 MachineBasicBlock::iterator(GetPC), false, 0); 1549 MRI.replaceRegWith(PCReg, Scav); 1550 MRI.clearVirtRegs(); 1551 RS->setRegUsed(Scav); 1552 1553 return 4 + 8 + 4 + 4; 1554 } 1555 1556 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 1557 switch (Cond) { 1558 case SIInstrInfo::SCC_TRUE: 1559 return AMDGPU::S_CBRANCH_SCC1; 1560 case SIInstrInfo::SCC_FALSE: 1561 return AMDGPU::S_CBRANCH_SCC0; 1562 case SIInstrInfo::VCCNZ: 1563 return AMDGPU::S_CBRANCH_VCCNZ; 1564 case SIInstrInfo::VCCZ: 1565 return AMDGPU::S_CBRANCH_VCCZ; 1566 case SIInstrInfo::EXECNZ: 1567 return AMDGPU::S_CBRANCH_EXECNZ; 1568 case SIInstrInfo::EXECZ: 1569 return AMDGPU::S_CBRANCH_EXECZ; 1570 default: 1571 llvm_unreachable("invalid branch predicate"); 1572 } 1573 } 1574 1575 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 1576 switch (Opcode) { 1577 case AMDGPU::S_CBRANCH_SCC0: 1578 return SCC_FALSE; 1579 case AMDGPU::S_CBRANCH_SCC1: 1580 return SCC_TRUE; 1581 case AMDGPU::S_CBRANCH_VCCNZ: 1582 return VCCNZ; 1583 case AMDGPU::S_CBRANCH_VCCZ: 1584 return VCCZ; 1585 case AMDGPU::S_CBRANCH_EXECNZ: 1586 return EXECNZ; 1587 case AMDGPU::S_CBRANCH_EXECZ: 1588 return EXECZ; 1589 default: 1590 return INVALID_BR; 1591 } 1592 } 1593 1594 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 1595 MachineBasicBlock::iterator I, 1596 MachineBasicBlock *&TBB, 1597 MachineBasicBlock *&FBB, 1598 SmallVectorImpl<MachineOperand> &Cond, 1599 bool AllowModify) const { 1600 if (I->getOpcode() == AMDGPU::S_BRANCH) { 1601 // Unconditional Branch 1602 TBB = I->getOperand(0).getMBB(); 1603 return false; 1604 } 1605 1606 MachineBasicBlock *CondBB = nullptr; 1607 1608 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 1609 CondBB = I->getOperand(1).getMBB(); 1610 Cond.push_back(I->getOperand(0)); 1611 } else { 1612 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 1613 if (Pred == INVALID_BR) 1614 return true; 1615 1616 CondBB = I->getOperand(0).getMBB(); 1617 Cond.push_back(MachineOperand::CreateImm(Pred)); 1618 Cond.push_back(I->getOperand(1)); // Save the branch register. 1619 } 1620 ++I; 1621 1622 if (I == MBB.end()) { 1623 // Conditional branch followed by fall-through. 1624 TBB = CondBB; 1625 return false; 1626 } 1627 1628 if (I->getOpcode() == AMDGPU::S_BRANCH) { 1629 TBB = CondBB; 1630 FBB = I->getOperand(0).getMBB(); 1631 return false; 1632 } 1633 1634 return true; 1635 } 1636 1637 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 1638 MachineBasicBlock *&FBB, 1639 SmallVectorImpl<MachineOperand> &Cond, 1640 bool AllowModify) const { 1641 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 1642 auto E = MBB.end(); 1643 if (I == E) 1644 return false; 1645 1646 // Skip over the instructions that are artificially terminators for special 1647 // exec management. 
1648 while (I != E && !I->isBranch() && !I->isReturn() && 1649 I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { 1650 switch (I->getOpcode()) { 1651 case AMDGPU::SI_MASK_BRANCH: 1652 case AMDGPU::S_MOV_B64_term: 1653 case AMDGPU::S_XOR_B64_term: 1654 case AMDGPU::S_ANDN2_B64_term: 1655 break; 1656 case AMDGPU::SI_IF: 1657 case AMDGPU::SI_ELSE: 1658 case AMDGPU::SI_KILL_I1_TERMINATOR: 1659 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 1660 // FIXME: It's messy that these need to be considered here at all. 1661 return true; 1662 default: 1663 llvm_unreachable("unexpected non-branch terminator inst"); 1664 } 1665 1666 ++I; 1667 } 1668 1669 if (I == E) 1670 return false; 1671 1672 if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) 1673 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 1674 1675 ++I; 1676 1677 // TODO: Should be able to treat as fallthrough? 1678 if (I == MBB.end()) 1679 return true; 1680 1681 if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) 1682 return true; 1683 1684 MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); 1685 1686 // Specifically handle the case where the conditional branch is to the same 1687 // destination as the mask branch. e.g. 1688 // 1689 // si_mask_branch BB8 1690 // s_cbranch_execz BB8 1691 // s_cbranch BB9 1692 // 1693 // This is required to understand divergent loops which may need the branches 1694 // to be relaxed. 1695 if (TBB != MaskBrDest || Cond.empty()) 1696 return true; 1697 1698 auto Pred = Cond[0].getImm(); 1699 return (Pred != EXECZ && Pred != EXECNZ); 1700 } 1701 1702 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 1703 int *BytesRemoved) const { 1704 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 1705 1706 unsigned Count = 0; 1707 unsigned RemovedSize = 0; 1708 while (I != MBB.end()) { 1709 MachineBasicBlock::iterator Next = std::next(I); 1710 if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { 1711 I = Next; 1712 continue; 1713 } 1714 1715 RemovedSize += getInstSizeInBytes(*I); 1716 I->eraseFromParent(); 1717 ++Count; 1718 I = Next; 1719 } 1720 1721 if (BytesRemoved) 1722 *BytesRemoved = RemovedSize; 1723 1724 return Count; 1725 } 1726 1727 // Copy the flags onto the implicit condition register operand. 1728 static void preserveCondRegFlags(MachineOperand &CondReg, 1729 const MachineOperand &OrigCond) { 1730 CondReg.setIsUndef(OrigCond.isUndef()); 1731 CondReg.setIsKill(OrigCond.isKill()); 1732 } 1733 1734 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 1735 MachineBasicBlock *TBB, 1736 MachineBasicBlock *FBB, 1737 ArrayRef<MachineOperand> Cond, 1738 const DebugLoc &DL, 1739 int *BytesAdded) const { 1740 if (!FBB && Cond.empty()) { 1741 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1742 .addMBB(TBB); 1743 if (BytesAdded) 1744 *BytesAdded = 4; 1745 return 1; 1746 } 1747 1748 if(Cond.size() == 1 && Cond[0].isReg()) { 1749 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 1750 .add(Cond[0]) 1751 .addMBB(TBB); 1752 return 1; 1753 } 1754 1755 assert(TBB && Cond[0].isImm()); 1756 1757 unsigned Opcode 1758 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 1759 1760 if (!FBB) { 1761 Cond[1].isUndef(); 1762 MachineInstr *CondBr = 1763 BuildMI(&MBB, DL, get(Opcode)) 1764 .addMBB(TBB); 1765 1766 // Copy the flags onto the implicit condition register operand. 
1767 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 1768 1769 if (BytesAdded) 1770 *BytesAdded = 4; 1771 return 1; 1772 } 1773 1774 assert(TBB && FBB); 1775 1776 MachineInstr *CondBr = 1777 BuildMI(&MBB, DL, get(Opcode)) 1778 .addMBB(TBB); 1779 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 1780 .addMBB(FBB); 1781 1782 MachineOperand &CondReg = CondBr->getOperand(1); 1783 CondReg.setIsUndef(Cond[1].isUndef()); 1784 CondReg.setIsKill(Cond[1].isKill()); 1785 1786 if (BytesAdded) 1787 *BytesAdded = 8; 1788 1789 return 2; 1790 } 1791 1792 bool SIInstrInfo::reverseBranchCondition( 1793 SmallVectorImpl<MachineOperand> &Cond) const { 1794 if (Cond.size() != 2) { 1795 return true; 1796 } 1797 1798 if (Cond[0].isImm()) { 1799 Cond[0].setImm(-Cond[0].getImm()); 1800 return false; 1801 } 1802 1803 return true; 1804 } 1805 1806 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 1807 ArrayRef<MachineOperand> Cond, 1808 unsigned TrueReg, unsigned FalseReg, 1809 int &CondCycles, 1810 int &TrueCycles, int &FalseCycles) const { 1811 switch (Cond[0].getImm()) { 1812 case VCCNZ: 1813 case VCCZ: { 1814 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1815 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1816 assert(MRI.getRegClass(FalseReg) == RC); 1817 1818 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1819 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1820 1821 // Limit to equal cost for branch vs. N v_cndmask_b32s. 1822 return !RI.isSGPRClass(RC) && NumInsts <= 6; 1823 } 1824 case SCC_TRUE: 1825 case SCC_FALSE: { 1826 // FIXME: We could insert for VGPRs if we could replace the original compare 1827 // with a vector one. 1828 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1829 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 1830 assert(MRI.getRegClass(FalseReg) == RC); 1831 1832 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 1833 1834 // Multiples of 8 can do s_cselect_b64 1835 if (NumInsts % 2 == 0) 1836 NumInsts /= 2; 1837 1838 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 1839 return RI.isSGPRClass(RC); 1840 } 1841 default: 1842 return false; 1843 } 1844 } 1845 1846 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 1847 MachineBasicBlock::iterator I, const DebugLoc &DL, 1848 unsigned DstReg, ArrayRef<MachineOperand> Cond, 1849 unsigned TrueReg, unsigned FalseReg) const { 1850 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 1851 if (Pred == VCCZ || Pred == SCC_FALSE) { 1852 Pred = static_cast<BranchPredicate>(-Pred); 1853 std::swap(TrueReg, FalseReg); 1854 } 1855 1856 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1857 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 1858 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 1859 1860 if (DstSize == 32) { 1861 unsigned SelOp = Pred == SCC_TRUE ? 1862 AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; 1863 1864 // Instruction's operands are backwards from what is expected. 
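    // For the VALU case this is because V_CNDMASK_B32 selects src1 where the
    // condition bit is set and src0 where it is clear, i.e. (illustrative):
    //
    //   v_cndmask_b32 %dst, %false, %true, vcc  ; lane takes %true if its vcc bit is 1
    //
    // so FalseReg must be added first (src0) and TrueReg second (src1).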
1865 MachineInstr *Select = 1866 BuildMI(MBB, I, DL, get(SelOp), DstReg) 1867 .addReg(FalseReg) 1868 .addReg(TrueReg); 1869 1870 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 1871 return; 1872 } 1873 1874 if (DstSize == 64 && Pred == SCC_TRUE) { 1875 MachineInstr *Select = 1876 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 1877 .addReg(FalseReg) 1878 .addReg(TrueReg); 1879 1880 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 1881 return; 1882 } 1883 1884 static const int16_t Sub0_15[] = { 1885 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1886 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1887 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1888 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1889 }; 1890 1891 static const int16_t Sub0_15_64[] = { 1892 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1893 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1894 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1895 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 1896 }; 1897 1898 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 1899 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 1900 const int16_t *SubIndices = Sub0_15; 1901 int NElts = DstSize / 32; 1902 1903 // 64-bit select is only avaialble for SALU. 1904 if (Pred == SCC_TRUE) { 1905 SelOp = AMDGPU::S_CSELECT_B64; 1906 EltRC = &AMDGPU::SGPR_64RegClass; 1907 SubIndices = Sub0_15_64; 1908 1909 assert(NElts % 2 == 0); 1910 NElts /= 2; 1911 } 1912 1913 MachineInstrBuilder MIB = BuildMI( 1914 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 1915 1916 I = MIB->getIterator(); 1917 1918 SmallVector<unsigned, 8> Regs; 1919 for (int Idx = 0; Idx != NElts; ++Idx) { 1920 unsigned DstElt = MRI.createVirtualRegister(EltRC); 1921 Regs.push_back(DstElt); 1922 1923 unsigned SubIdx = SubIndices[Idx]; 1924 1925 MachineInstr *Select = 1926 BuildMI(MBB, I, DL, get(SelOp), DstElt) 1927 .addReg(FalseReg, 0, SubIdx) 1928 .addReg(TrueReg, 0, SubIdx); 1929 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 1930 1931 MIB.addReg(DstElt) 1932 .addImm(SubIdx); 1933 } 1934 } 1935 1936 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 1937 switch (MI.getOpcode()) { 1938 case AMDGPU::V_MOV_B32_e32: 1939 case AMDGPU::V_MOV_B32_e64: 1940 case AMDGPU::V_MOV_B64_PSEUDO: { 1941 // If there are additional implicit register operands, this may be used for 1942 // register indexing so the source register operand isn't simply copied. 
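    // For example, when VGPR indexing is active (S_SET_GPR_IDX_ON, handled in
    // changesVGPRIndexingMode below), a V_MOV_B32 may carry an extra implicit
    // M0 use and then moves a dynamically selected register instead of acting
    // as a plain copy. (Illustrative description; the exact implicit operands
    // depend on how the indexed move was emitted.)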
1943 unsigned NumOps = MI.getDesc().getNumOperands() + 1944 MI.getDesc().getNumImplicitUses(); 1945 1946 return MI.getNumOperands() == NumOps; 1947 } 1948 case AMDGPU::S_MOV_B32: 1949 case AMDGPU::S_MOV_B64: 1950 case AMDGPU::COPY: 1951 return true; 1952 default: 1953 return false; 1954 } 1955 } 1956 1957 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( 1958 unsigned Kind) const { 1959 switch(Kind) { 1960 case PseudoSourceValue::Stack: 1961 case PseudoSourceValue::FixedStack: 1962 return AMDGPUAS::PRIVATE_ADDRESS; 1963 case PseudoSourceValue::ConstantPool: 1964 case PseudoSourceValue::GOT: 1965 case PseudoSourceValue::JumpTable: 1966 case PseudoSourceValue::GlobalValueCallEntry: 1967 case PseudoSourceValue::ExternalSymbolCallEntry: 1968 case PseudoSourceValue::TargetCustom: 1969 return AMDGPUAS::CONSTANT_ADDRESS; 1970 } 1971 return AMDGPUAS::FLAT_ADDRESS; 1972 } 1973 1974 static void removeModOperands(MachineInstr &MI) { 1975 unsigned Opc = MI.getOpcode(); 1976 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1977 AMDGPU::OpName::src0_modifiers); 1978 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1979 AMDGPU::OpName::src1_modifiers); 1980 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, 1981 AMDGPU::OpName::src2_modifiers); 1982 1983 MI.RemoveOperand(Src2ModIdx); 1984 MI.RemoveOperand(Src1ModIdx); 1985 MI.RemoveOperand(Src0ModIdx); 1986 } 1987 1988 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 1989 unsigned Reg, MachineRegisterInfo *MRI) const { 1990 if (!MRI->hasOneNonDBGUse(Reg)) 1991 return false; 1992 1993 switch (DefMI.getOpcode()) { 1994 default: 1995 return false; 1996 case AMDGPU::S_MOV_B64: 1997 // TODO: We could fold 64-bit immediates, but this get compilicated 1998 // when there are sub-registers. 1999 return false; 2000 2001 case AMDGPU::V_MOV_B32_e32: 2002 case AMDGPU::S_MOV_B32: 2003 break; 2004 } 2005 2006 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 2007 assert(ImmOp); 2008 // FIXME: We could handle FrameIndex values here. 2009 if (!ImmOp->isImm()) 2010 return false; 2011 2012 unsigned Opc = UseMI.getOpcode(); 2013 if (Opc == AMDGPU::COPY) { 2014 bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); 2015 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2016 UseMI.setDesc(get(NewOpc)); 2017 UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); 2018 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2019 return true; 2020 } 2021 2022 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || 2023 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { 2024 // Don't fold if we are using source or output modifiers. The new VOP2 2025 // instructions don't have them. 2026 if (hasAnyModifiersSet(UseMI)) 2027 return false; 2028 2029 // If this is a free constant, there's no reason to do this. 2030 // TODO: We could fold this here instead of letting SIFoldOperands do it 2031 // later. 2032 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2033 2034 // Any src operand can be used for the legality check. 2035 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2036 return false; 2037 2038 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; 2039 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2040 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2041 2042 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 
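    // v_madmk encodes the constant as a 32-bit literal that multiplies src0,
    // e.g. (illustrative):
    //
    //   v_madmk_f32 %dst, %src0, 0x41200000, %src2  ; %dst = %src0 * 10.0 + %src2
    //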
2043 // We should only expect these to be on src0 due to canonicalizations. 2044 if (Src0->isReg() && Src0->getReg() == Reg) { 2045 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2046 return false; 2047 2048 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2049 return false; 2050 2051 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 2052 2053 const int64_t Imm = ImmOp->getImm(); 2054 2055 // FIXME: This would be a lot easier if we could return a new instruction 2056 // instead of having to modify in place. 2057 2058 // Remove these first since they are at the end. 2059 UseMI.RemoveOperand( 2060 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2061 UseMI.RemoveOperand( 2062 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2063 2064 unsigned Src1Reg = Src1->getReg(); 2065 unsigned Src1SubReg = Src1->getSubReg(); 2066 Src0->setReg(Src1Reg); 2067 Src0->setSubReg(Src1SubReg); 2068 Src0->setIsKill(Src1->isKill()); 2069 2070 if (Opc == AMDGPU::V_MAC_F32_e64 || 2071 Opc == AMDGPU::V_MAC_F16_e64) 2072 UseMI.untieRegOperand( 2073 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2074 2075 Src1->ChangeToImmediate(Imm); 2076 2077 removeModOperands(UseMI); 2078 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); 2079 2080 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2081 if (DeleteDef) 2082 DefMI.eraseFromParent(); 2083 2084 return true; 2085 } 2086 2087 // Added part is the constant: Use v_madak_{f16, f32}. 2088 if (Src2->isReg() && Src2->getReg() == Reg) { 2089 // Not allowed to use constant bus for another operand. 2090 // We can however allow an inline immediate as src0. 2091 bool Src0Inlined = false; 2092 if (Src0->isReg()) { 2093 // Try to inline constant if possible. 2094 // If the Def moves immediate and the use is single 2095 // We are saving VGPR here. 2096 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2097 if (Def && Def->isMoveImmediate() && 2098 isInlineConstant(Def->getOperand(1)) && 2099 MRI->hasOneUse(Src0->getReg())) { 2100 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2101 Src0Inlined = true; 2102 } else if ((RI.isPhysicalRegister(Src0->getReg()) && 2103 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) || 2104 (RI.isVirtualRegister(Src0->getReg()) && 2105 RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) 2106 return false; 2107 // VGPR is okay as Src0 - fallthrough 2108 } 2109 2110 if (Src1->isReg() && !Src0Inlined ) { 2111 // We have one slot for inlinable constant so far - try to fill it 2112 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2113 if (Def && Def->isMoveImmediate() && 2114 isInlineConstant(Def->getOperand(1)) && 2115 MRI->hasOneUse(Src1->getReg()) && 2116 commuteInstruction(UseMI)) { 2117 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2118 } else if ((RI.isPhysicalRegister(Src1->getReg()) && 2119 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2120 (RI.isVirtualRegister(Src1->getReg()) && 2121 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2122 return false; 2123 // VGPR is okay as Src1 - fallthrough 2124 } 2125 2126 const int64_t Imm = ImmOp->getImm(); 2127 2128 // FIXME: This would be a lot easier if we could return a new instruction 2129 // instead of having to modify in place. 2130 2131 // Remove these first since they are at the end. 
2132 UseMI.RemoveOperand( 2133 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2134 UseMI.RemoveOperand( 2135 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2136 2137 if (Opc == AMDGPU::V_MAC_F32_e64 || 2138 Opc == AMDGPU::V_MAC_F16_e64) 2139 UseMI.untieRegOperand( 2140 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2141 2142 // ChangingToImmediate adds Src2 back to the instruction. 2143 Src2->ChangeToImmediate(Imm); 2144 2145 // These come before src2. 2146 removeModOperands(UseMI); 2147 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); 2148 2149 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2150 if (DeleteDef) 2151 DefMI.eraseFromParent(); 2152 2153 return true; 2154 } 2155 } 2156 2157 return false; 2158 } 2159 2160 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 2161 int WidthB, int OffsetB) { 2162 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 2163 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 2164 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 2165 return LowOffset + LowWidth <= HighOffset; 2166 } 2167 2168 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, 2169 MachineInstr &MIb) const { 2170 MachineOperand *BaseOp0, *BaseOp1; 2171 int64_t Offset0, Offset1; 2172 2173 if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && 2174 getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { 2175 if (!BaseOp0->isIdenticalTo(*BaseOp1)) 2176 return false; 2177 2178 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 2179 // FIXME: Handle ds_read2 / ds_write2. 2180 return false; 2181 } 2182 unsigned Width0 = (*MIa.memoperands_begin())->getSize(); 2183 unsigned Width1 = (*MIb.memoperands_begin())->getSize(); 2184 if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { 2185 return true; 2186 } 2187 } 2188 2189 return false; 2190 } 2191 2192 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, 2193 MachineInstr &MIb, 2194 AliasAnalysis *AA) const { 2195 assert((MIa.mayLoad() || MIa.mayStore()) && 2196 "MIa must load from or modify a memory location"); 2197 assert((MIb.mayLoad() || MIb.mayStore()) && 2198 "MIb must load from or modify a memory location"); 2199 2200 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 2201 return false; 2202 2203 // XXX - Can we relax this between address spaces? 2204 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 2205 return false; 2206 2207 // TODO: Should we check the address space from the MachineMemOperand? That 2208 // would allow us to distinguish objects we know don't alias based on the 2209 // underlying address space, even if it was lowered to a different one, 2210 // e.g. private accesses lowered to use MUBUF instructions on a scratch 2211 // buffer. 
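  // The checks below key off the hardware memory class: DS (LDS) accesses
  // cannot alias buffer or scalar memory accesses, while FLAT may touch any
  // address space and so is only disambiguated against another FLAT access by
  // its offset (segment-specific FLAT, i.e. global/scratch, is additionally
  // known not to touch LDS).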
2212 if (isDS(MIa)) { 2213 if (isDS(MIb)) 2214 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2215 2216 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 2217 } 2218 2219 if (isMUBUF(MIa) || isMTBUF(MIa)) { 2220 if (isMUBUF(MIb) || isMTBUF(MIb)) 2221 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2222 2223 return !isFLAT(MIb) && !isSMRD(MIb); 2224 } 2225 2226 if (isSMRD(MIa)) { 2227 if (isSMRD(MIb)) 2228 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2229 2230 return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); 2231 } 2232 2233 if (isFLAT(MIa)) { 2234 if (isFLAT(MIb)) 2235 return checkInstOffsetsDoNotOverlap(MIa, MIb); 2236 2237 return false; 2238 } 2239 2240 return false; 2241 } 2242 2243 static int64_t getFoldableImm(const MachineOperand* MO) { 2244 if (!MO->isReg()) 2245 return false; 2246 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 2247 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2248 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 2249 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 2250 Def->getOperand(1).isImm()) 2251 return Def->getOperand(1).getImm(); 2252 return AMDGPU::NoRegister; 2253 } 2254 2255 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 2256 MachineInstr &MI, 2257 LiveVariables *LV) const { 2258 unsigned Opc = MI.getOpcode(); 2259 bool IsF16 = false; 2260 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; 2261 2262 switch (Opc) { 2263 default: 2264 return nullptr; 2265 case AMDGPU::V_MAC_F16_e64: 2266 IsF16 = true; 2267 LLVM_FALLTHROUGH; 2268 case AMDGPU::V_MAC_F32_e64: 2269 case AMDGPU::V_FMAC_F32_e64: 2270 break; 2271 case AMDGPU::V_MAC_F16_e32: 2272 IsF16 = true; 2273 LLVM_FALLTHROUGH; 2274 case AMDGPU::V_MAC_F32_e32: 2275 case AMDGPU::V_FMAC_F32_e32: { 2276 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 2277 AMDGPU::OpName::src0); 2278 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 2279 if (!Src0->isReg() && !Src0->isImm()) 2280 return nullptr; 2281 2282 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 2283 return nullptr; 2284 2285 break; 2286 } 2287 } 2288 2289 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 2290 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 2291 const MachineOperand *Src0Mods = 2292 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 2293 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2294 const MachineOperand *Src1Mods = 2295 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 2296 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2297 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2298 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 2299 2300 if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && 2301 // If we have an SGPR input, we will violate the constant bus restriction. 2302 (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 2303 if (auto Imm = getFoldableImm(Src2)) { 2304 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2305 get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) 2306 .add(*Dst) 2307 .add(*Src0) 2308 .add(*Src1) 2309 .addImm(Imm); 2310 } 2311 if (auto Imm = getFoldableImm(Src1)) { 2312 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2313 get(IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) 2314 .add(*Dst) 2315 .add(*Src0) 2316 .addImm(Imm) 2317 .add(*Src2); 2318 } 2319 if (auto Imm = getFoldableImm(Src0)) { 2320 if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32, 2321 AMDGPU::OpName::src0), Src1)) 2322 return BuildMI(*MBB, MI, MI.getDebugLoc(), 2323 get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) 2324 .add(*Dst) 2325 .add(*Src1) 2326 .addImm(Imm) 2327 .add(*Src2); 2328 } 2329 } 2330 2331 assert((!IsFMA || !IsF16) && "fmac only expected with f32"); 2332 unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : 2333 (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); 2334 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 2335 .add(*Dst) 2336 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 2337 .add(*Src0) 2338 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 2339 .add(*Src1) 2340 .addImm(0) // Src mods 2341 .add(*Src2) 2342 .addImm(Clamp ? Clamp->getImm() : 0) 2343 .addImm(Omod ? Omod->getImm() : 0); 2344 } 2345 2346 // It's not generally safe to move VALU instructions across these since it will 2347 // start using the register as a base index rather than directly. 2348 // XXX - Why isn't hasSideEffects sufficient for these? 2349 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 2350 switch (MI.getOpcode()) { 2351 case AMDGPU::S_SET_GPR_IDX_ON: 2352 case AMDGPU::S_SET_GPR_IDX_MODE: 2353 case AMDGPU::S_SET_GPR_IDX_OFF: 2354 return true; 2355 default: 2356 return false; 2357 } 2358 } 2359 2360 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 2361 const MachineBasicBlock *MBB, 2362 const MachineFunction &MF) const { 2363 // XXX - Do we want the SP check in the base implementation? 2364 2365 // Target-independent instructions do not have an implicit-use of EXEC, even 2366 // when they operate on VGPRs. Treating EXEC modifications as scheduling 2367 // boundaries prevents incorrect movements of such instructions. 2368 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || 2369 MI.modifiesRegister(AMDGPU::EXEC, &RI) || 2370 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 2371 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 2372 changesVGPRIndexingMode(MI); 2373 } 2374 2375 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 2376 return Opcode == AMDGPU::DS_ORDERED_COUNT || 2377 Opcode == AMDGPU::DS_GWS_INIT || 2378 Opcode == AMDGPU::DS_GWS_SEMA_V || 2379 Opcode == AMDGPU::DS_GWS_SEMA_BR || 2380 Opcode == AMDGPU::DS_GWS_SEMA_P || 2381 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 2382 Opcode == AMDGPU::DS_GWS_BARRIER; 2383 } 2384 2385 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 2386 unsigned Opcode = MI.getOpcode(); 2387 2388 if (MI.mayStore() && isSMRD(MI)) 2389 return true; // scalar store or atomic 2390 2391 // These instructions cause shader I/O that may cause hardware lockups 2392 // when executed with an empty EXEC mask. 2393 // 2394 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 2395 // EXEC = 0, but checking for that case here seems not worth it 2396 // given the typical code patterns. 2397 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 2398 Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || 2399 Opcode == AMDGPU::DS_ORDERED_COUNT) 2400 return true; 2401 2402 if (MI.isInlineAsm()) 2403 return true; // conservative assumption 2404 2405 // These are like SALU instructions in terms of effects, so it's questionable 2406 // whether we should return true for those. 
2407 // 2408 // However, executing them with EXEC = 0 causes them to operate on undefined 2409 // data, which we avoid by returning true here. 2410 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32) 2411 return true; 2412 2413 return false; 2414 } 2415 2416 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 2417 switch (Imm.getBitWidth()) { 2418 case 32: 2419 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 2420 ST.hasInv2PiInlineImm()); 2421 case 64: 2422 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 2423 ST.hasInv2PiInlineImm()); 2424 case 16: 2425 return ST.has16BitInsts() && 2426 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 2427 ST.hasInv2PiInlineImm()); 2428 default: 2429 llvm_unreachable("invalid bitwidth"); 2430 } 2431 } 2432 2433 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 2434 uint8_t OperandType) const { 2435 if (!MO.isImm() || 2436 OperandType < AMDGPU::OPERAND_SRC_FIRST || 2437 OperandType > AMDGPU::OPERAND_SRC_LAST) 2438 return false; 2439 2440 // MachineOperand provides no way to tell the true operand size, since it only 2441 // records a 64-bit value. We need to know the size to determine if a 32-bit 2442 // floating point immediate bit pattern is legal for an integer immediate. It 2443 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 2444 2445 int64_t Imm = MO.getImm(); 2446 switch (OperandType) { 2447 case AMDGPU::OPERAND_REG_IMM_INT32: 2448 case AMDGPU::OPERAND_REG_IMM_FP32: 2449 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2450 case AMDGPU::OPERAND_REG_INLINE_C_FP32: { 2451 int32_t Trunc = static_cast<int32_t>(Imm); 2452 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 2453 } 2454 case AMDGPU::OPERAND_REG_IMM_INT64: 2455 case AMDGPU::OPERAND_REG_IMM_FP64: 2456 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2457 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2458 return AMDGPU::isInlinableLiteral64(MO.getImm(), 2459 ST.hasInv2PiInlineImm()); 2460 case AMDGPU::OPERAND_REG_IMM_INT16: 2461 case AMDGPU::OPERAND_REG_IMM_FP16: 2462 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2463 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2464 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 2465 // A few special case instructions have 16-bit operands on subtargets 2466 // where 16-bit instructions are not legal. 2467 // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle 2468 // constants in these cases 2469 int16_t Trunc = static_cast<int16_t>(Imm); 2470 return ST.has16BitInsts() && 2471 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 2472 } 2473 2474 return false; 2475 } 2476 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 2477 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { 2478 if (isUInt<16>(Imm)) { 2479 int16_t Trunc = static_cast<int16_t>(Imm); 2480 return ST.has16BitInsts() && 2481 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 2482 } 2483 if (!(Imm & 0xffff)) { 2484 return ST.has16BitInsts() && 2485 AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm()); 2486 } 2487 uint32_t Trunc = static_cast<uint32_t>(Imm); 2488 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 2489 } 2490 default: 2491 llvm_unreachable("invalid bitwidth"); 2492 } 2493 } 2494 2495 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 2496 const MCOperandInfo &OpInfo) const { 2497 switch (MO.getType()) { 2498 case MachineOperand::MO_Register: 2499 return false; 2500 case MachineOperand::MO_Immediate: 2501 return !isInlineConstant(MO, OpInfo); 2502 case MachineOperand::MO_FrameIndex: 2503 case MachineOperand::MO_MachineBasicBlock: 2504 case MachineOperand::MO_ExternalSymbol: 2505 case MachineOperand::MO_GlobalAddress: 2506 case MachineOperand::MO_MCSymbol: 2507 return true; 2508 default: 2509 llvm_unreachable("unexpected operand type"); 2510 } 2511 } 2512 2513 static bool compareMachineOp(const MachineOperand &Op0, 2514 const MachineOperand &Op1) { 2515 if (Op0.getType() != Op1.getType()) 2516 return false; 2517 2518 switch (Op0.getType()) { 2519 case MachineOperand::MO_Register: 2520 return Op0.getReg() == Op1.getReg(); 2521 case MachineOperand::MO_Immediate: 2522 return Op0.getImm() == Op1.getImm(); 2523 default: 2524 llvm_unreachable("Didn't expect to be comparing these operand types"); 2525 } 2526 } 2527 2528 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 2529 const MachineOperand &MO) const { 2530 const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; 2531 2532 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 2533 2534 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 2535 return true; 2536 2537 if (OpInfo.RegClass < 0) 2538 return false; 2539 2540 if (MO.isImm() && isInlineConstant(MO, OpInfo)) 2541 return RI.opCanUseInlineConstant(OpInfo.OperandType); 2542 2543 return RI.opCanUseLiteralConstant(OpInfo.OperandType); 2544 } 2545 2546 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 2547 int Op32 = AMDGPU::getVOPe32(Opcode); 2548 if (Op32 == -1) 2549 return false; 2550 2551 return pseudoToMCOpcode(Op32) != -1; 2552 } 2553 2554 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 2555 // The src0_modifier operand is present on all instructions 2556 // that have modifiers. 
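  // "Modifiers" here means the VOP3/SDWA/DPP-style operand modifiers (source
  // neg/abs plus the clamp and omod output modifiers), which show up as extra
  // immediate operands (src*_modifiers, clamp, omod) on the MachineInstr.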

  return AMDGPU::getNamedOperandIdx(Opcode,
                                    AMDGPU::OpName::src0_modifiers) != -1;
}

bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
                                  unsigned OpName) const {
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
  return Mods && Mods->getImm();
}

bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         hasModifiersSet(MI, AMDGPU::OpName::omod);
}

bool SIInstrInfo::canShrink(const MachineInstr &MI,
                            const MachineRegisterInfo &MRI) const {
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it. It can only be shrunk if the third operand
  // is vcc. We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then doing the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_ADDC_U32_e64:
      case AMDGPU::V_SUBB_U32_e64:
      case AMDGPU::V_SUBBREV_U32_e64: {
        const MachineOperand *Src1
          = getNamedOperand(MI, AMDGPU::OpName::src1);
        if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
          return false;
        // Additional verification is needed for sdst/src2.
        return true;
      }
      case AMDGPU::V_MAC_F32_e64:
      case AMDGPU::V_MAC_F16_e64:
      case AMDGPU::V_FMAC_F32_e64:
        if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
            hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
               hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Can it be shrunk to a valid 32 bit opcode?
  if (!hasVALU32BitEncoding(MI.getOpcode()))
    return false;

  // Check output modifiers
  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
         !hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

// Set VCC operand with all flags from \p Orig, except for setting it as
// implicit.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {
  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}

MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
                                           unsigned Op32) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineInstrBuilder Inst32 =
    BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));

  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
  // For VOPC instructions, this is replaced by an implicit def of vcc.
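  // E.g. (illustrative) a VOPC compare whose result is already constrained to
  // vcc shrinks from
  //   $vcc = V_CMP_EQ_U32_e64 %a, %b
  // to
  //   V_CMP_EQ_U32_e32 %a, %b, implicit-def $vcc
  // and so has no explicit vdst operand to copy over.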
2653 int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); 2654 if (Op32DstIdx != -1) { 2655 // dst 2656 Inst32.add(MI.getOperand(0)); 2657 } else { 2658 assert(MI.getOperand(0).getReg() == AMDGPU::VCC && 2659 "Unexpected case"); 2660 } 2661 2662 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 2663 2664 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 2665 if (Src1) 2666 Inst32.add(*Src1); 2667 2668 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 2669 2670 if (Src2) { 2671 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 2672 if (Op32Src2Idx != -1) { 2673 Inst32.add(*Src2); 2674 } else { 2675 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 2676 // replaced with an implicit read of vcc. This was already added 2677 // during the initial BuildMI, so find it to preserve the flags. 2678 copyFlagsToImplicitVCC(*Inst32, *Src2); 2679 } 2680 } 2681 2682 return Inst32; 2683 } 2684 2685 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 2686 const MachineOperand &MO, 2687 const MCOperandInfo &OpInfo) const { 2688 // Literal constants use the constant bus. 2689 //if (isLiteralConstantLike(MO, OpInfo)) 2690 // return true; 2691 if (MO.isImm()) 2692 return !isInlineConstant(MO, OpInfo); 2693 2694 if (!MO.isReg()) 2695 return true; // Misc other operands like FrameIndex 2696 2697 if (!MO.isUse()) 2698 return false; 2699 2700 if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) 2701 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 2702 2703 // FLAT_SCR is just an SGPR pair. 2704 if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) 2705 return true; 2706 2707 // EXEC register uses the constant bus. 2708 if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) 2709 return true; 2710 2711 // SGPRs use the constant bus 2712 return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || 2713 (!MO.isImplicit() && 2714 (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || 2715 AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); 2716 } 2717 2718 static unsigned findImplicitSGPRRead(const MachineInstr &MI) { 2719 for (const MachineOperand &MO : MI.implicit_operands()) { 2720 // We only care about reads. 
2721 if (MO.isDef()) 2722 continue; 2723 2724 switch (MO.getReg()) { 2725 case AMDGPU::VCC: 2726 case AMDGPU::M0: 2727 case AMDGPU::FLAT_SCR: 2728 return MO.getReg(); 2729 2730 default: 2731 break; 2732 } 2733 } 2734 2735 return AMDGPU::NoRegister; 2736 } 2737 2738 static bool shouldReadExec(const MachineInstr &MI) { 2739 if (SIInstrInfo::isVALU(MI)) { 2740 switch (MI.getOpcode()) { 2741 case AMDGPU::V_READLANE_B32: 2742 case AMDGPU::V_READLANE_B32_si: 2743 case AMDGPU::V_READLANE_B32_vi: 2744 case AMDGPU::V_WRITELANE_B32: 2745 case AMDGPU::V_WRITELANE_B32_si: 2746 case AMDGPU::V_WRITELANE_B32_vi: 2747 return false; 2748 } 2749 2750 return true; 2751 } 2752 2753 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 2754 SIInstrInfo::isSALU(MI) || 2755 SIInstrInfo::isSMRD(MI)) 2756 return false; 2757 2758 return true; 2759 } 2760 2761 static bool isSubRegOf(const SIRegisterInfo &TRI, 2762 const MachineOperand &SuperVec, 2763 const MachineOperand &SubReg) { 2764 if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) 2765 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 2766 2767 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 2768 SubReg.getReg() == SuperVec.getReg(); 2769 } 2770 2771 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 2772 StringRef &ErrInfo) const { 2773 uint16_t Opcode = MI.getOpcode(); 2774 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 2775 return true; 2776 2777 const MachineFunction *MF = MI.getParent()->getParent(); 2778 const MachineRegisterInfo &MRI = MF->getRegInfo(); 2779 2780 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 2781 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 2782 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 2783 2784 // Make sure the number of operands is correct. 2785 const MCInstrDesc &Desc = get(Opcode); 2786 if (!Desc.isVariadic() && 2787 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 2788 ErrInfo = "Instruction has wrong number of operands."; 2789 return false; 2790 } 2791 2792 if (MI.isInlineAsm()) { 2793 // Verify register classes for inlineasm constraints. 2794 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 2795 I != E; ++I) { 2796 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 2797 if (!RC) 2798 continue; 2799 2800 const MachineOperand &Op = MI.getOperand(I); 2801 if (!Op.isReg()) 2802 continue; 2803 2804 unsigned Reg = Op.getReg(); 2805 if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { 2806 ErrInfo = "inlineasm operand has incorrect register class."; 2807 return false; 2808 } 2809 } 2810 2811 return true; 2812 } 2813 2814 // Make sure the register classes are correct. 2815 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 2816 if (MI.getOperand(i).isFPImm()) { 2817 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 2818 "all fp values to integers."; 2819 return false; 2820 } 2821 2822 int RegClass = Desc.OpInfo[i].RegClass; 2823 2824 switch (Desc.OpInfo[i].OperandType) { 2825 case MCOI::OPERAND_REGISTER: 2826 if (MI.getOperand(i).isImm()) { 2827 ErrInfo = "Illegal immediate value for operand."; 2828 return false; 2829 } 2830 break; 2831 case AMDGPU::OPERAND_REG_IMM_INT32: 2832 case AMDGPU::OPERAND_REG_IMM_FP32: 2833 break; 2834 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 2835 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 2836 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 2837 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 2838 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 2839 case AMDGPU::OPERAND_REG_INLINE_C_FP16: { 2840 const MachineOperand &MO = MI.getOperand(i); 2841 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 2842 ErrInfo = "Illegal immediate value for operand."; 2843 return false; 2844 } 2845 break; 2846 } 2847 case MCOI::OPERAND_IMMEDIATE: 2848 case AMDGPU::OPERAND_KIMM32: 2849 // Check if this operand is an immediate. 2850 // FrameIndex operands will be replaced by immediates, so they are 2851 // allowed. 2852 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 2853 ErrInfo = "Expected immediate, but got non-immediate"; 2854 return false; 2855 } 2856 LLVM_FALLTHROUGH; 2857 default: 2858 continue; 2859 } 2860 2861 if (!MI.getOperand(i).isReg()) 2862 continue; 2863 2864 if (RegClass != -1) { 2865 unsigned Reg = MI.getOperand(i).getReg(); 2866 if (Reg == AMDGPU::NoRegister || 2867 TargetRegisterInfo::isVirtualRegister(Reg)) 2868 continue; 2869 2870 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 2871 if (!RC->contains(Reg)) { 2872 ErrInfo = "Operand has incorrect register class."; 2873 return false; 2874 } 2875 } 2876 } 2877 2878 // Verify SDWA 2879 if (isSDWA(MI)) { 2880 if (!ST.hasSDWA()) { 2881 ErrInfo = "SDWA is not supported on this target"; 2882 return false; 2883 } 2884 2885 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 2886 2887 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 2888 2889 for (int OpIdx: OpIndicies) { 2890 if (OpIdx == -1) 2891 continue; 2892 const MachineOperand &MO = MI.getOperand(OpIdx); 2893 2894 if (!ST.hasSDWAScalar()) { 2895 // Only VGPRS on VI 2896 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 2897 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 2898 return false; 2899 } 2900 } else { 2901 // No immediates on GFX9 2902 if (!MO.isReg()) { 2903 ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; 2904 return false; 2905 } 2906 } 2907 } 2908 2909 if (!ST.hasSDWAOmod()) { 2910 // No omod allowed on VI 2911 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 2912 if (OMod != nullptr && 2913 (!OMod->isImm() || OMod->getImm() != 0)) { 2914 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 2915 return false; 2916 } 2917 } 2918 2919 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 2920 if (isVOPC(BasicOpcode)) { 2921 if (!ST.hasSDWASdst() && DstIdx != -1) { 2922 // Only vcc allowed as dst on VI for VOPC 2923 const MachineOperand &Dst = MI.getOperand(DstIdx); 2924 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 2925 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 2926 return false; 2927 } 2928 } else if (!ST.hasSDWAOutModsVOPC()) { 2929 // No clamp allowed on GFX9 for VOPC 2930 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 2931 if (Clamp && 
(!Clamp->isImm() || Clamp->getImm() != 0)) { 2932 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 2933 return false; 2934 } 2935 2936 // No omod allowed on GFX9 for VOPC 2937 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 2938 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 2939 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 2940 return false; 2941 } 2942 } 2943 } 2944 2945 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 2946 if (DstUnused && DstUnused->isImm() && 2947 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 2948 const MachineOperand &Dst = MI.getOperand(DstIdx); 2949 if (!Dst.isReg() || !Dst.isTied()) { 2950 ErrInfo = "Dst register should have tied register"; 2951 return false; 2952 } 2953 2954 const MachineOperand &TiedMO = 2955 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 2956 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 2957 ErrInfo = 2958 "Dst register should be tied to implicit use of preserved register"; 2959 return false; 2960 } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) && 2961 Dst.getReg() != TiedMO.getReg()) { 2962 ErrInfo = "Dst register should use same physical register as preserved"; 2963 return false; 2964 } 2965 } 2966 } 2967 2968 // Verify MIMG 2969 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 2970 // Ensure that the return type used is large enough for all the options 2971 // being used TFE/LWE require an extra result register. 2972 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 2973 if (DMask) { 2974 uint64_t DMaskImm = DMask->getImm(); 2975 uint32_t RegCount = 2976 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 2977 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 2978 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 2979 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 2980 2981 // Adjust for packed 16 bit values 2982 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 2983 RegCount >>= 1; 2984 2985 // Adjust if using LWE or TFE 2986 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 2987 RegCount += 1; 2988 2989 const uint32_t DstIdx = 2990 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 2991 const MachineOperand &Dst = MI.getOperand(DstIdx); 2992 if (Dst.isReg()) { 2993 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 2994 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 2995 if (RegCount > DstSize) { 2996 ErrInfo = "MIMG instruction returns too many registers for dst " 2997 "register class"; 2998 return false; 2999 } 3000 } 3001 } 3002 } 3003 3004 // Verify VOP*. Ignore multiple sgpr operands on writelane. 3005 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 3006 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 3007 // Only look at the true operands. Only a real operand can use the constant 3008 // bus, and we don't want to check pseudo-operands like the source modifier 3009 // flags. 
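    // These encodings may read at most one SGPR or literal constant across all
    // of their source operands, e.g. (illustrative):
    //
    //   v_add_f32 v0, s0, v1   ; OK - one constant bus read
    //   v_add_f32 v0, s0, s1   ; invalid - two constant bus reads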
3010 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 3011 3012 unsigned ConstantBusCount = 0; 3013 unsigned LiteralCount = 0; 3014 3015 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 3016 ++ConstantBusCount; 3017 3018 unsigned SGPRUsed = findImplicitSGPRRead(MI); 3019 if (SGPRUsed != AMDGPU::NoRegister) 3020 ++ConstantBusCount; 3021 3022 for (int OpIdx : OpIndices) { 3023 if (OpIdx == -1) 3024 break; 3025 const MachineOperand &MO = MI.getOperand(OpIdx); 3026 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 3027 if (MO.isReg()) { 3028 if (MO.getReg() != SGPRUsed) 3029 ++ConstantBusCount; 3030 SGPRUsed = MO.getReg(); 3031 } else { 3032 ++ConstantBusCount; 3033 ++LiteralCount; 3034 } 3035 } 3036 } 3037 if (ConstantBusCount > 1) { 3038 ErrInfo = "VOP* instruction uses the constant bus more than once"; 3039 return false; 3040 } 3041 3042 if (isVOP3(MI) && LiteralCount) { 3043 ErrInfo = "VOP3 instruction uses literal"; 3044 return false; 3045 } 3046 } 3047 3048 // Verify misc. restrictions on specific instructions. 3049 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || 3050 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { 3051 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3052 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 3053 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 3054 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 3055 if (!compareMachineOp(Src0, Src1) && 3056 !compareMachineOp(Src0, Src2)) { 3057 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 3058 return false; 3059 } 3060 } 3061 } 3062 3063 if (isSOPK(MI)) { 3064 int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); 3065 if (sopkIsZext(MI)) { 3066 if (!isUInt<16>(Imm)) { 3067 ErrInfo = "invalid immediate for SOPK instruction"; 3068 return false; 3069 } 3070 } else { 3071 if (!isInt<16>(Imm)) { 3072 ErrInfo = "invalid immediate for SOPK instruction"; 3073 return false; 3074 } 3075 } 3076 } 3077 3078 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 3079 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 3080 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3081 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 3082 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 3083 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 3084 3085 const unsigned StaticNumOps = Desc.getNumOperands() + 3086 Desc.getNumImplicitUses(); 3087 const unsigned NumImplicitOps = IsDst ? 2 : 1; 3088 3089 // Allow additional implicit operands. This allows a fixup done by the post 3090 // RA scheduler where the main implicit operand is killed and implicit-defs 3091 // are added for sub-registers that remain live after this instruction. 3092 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 3093 ErrInfo = "missing implicit register operands"; 3094 return false; 3095 } 3096 3097 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3098 if (IsDst) { 3099 if (!Dst->isUse()) { 3100 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 3101 return false; 3102 } 3103 3104 unsigned UseOpIdx; 3105 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 3106 UseOpIdx != StaticNumOps + 1) { 3107 ErrInfo = "movrel implicit operands should be tied"; 3108 return false; 3109 } 3110 } 3111 3112 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 3113 const MachineOperand &ImpUse 3114 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 3115 if (!ImpUse.isReg() || !ImpUse.isUse() || 3116 !isSubRegOf(RI, ImpUse, IsDst ? 
*Dst : Src0)) { 3117 ErrInfo = "src0 should be subreg of implicit vector use"; 3118 return false; 3119 } 3120 } 3121 3122 // Make sure we aren't losing exec uses in the td files. This mostly requires 3123 // being careful when using let Uses to try to add other use registers. 3124 if (shouldReadExec(MI)) { 3125 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 3126 ErrInfo = "VALU instruction does not implicitly read exec mask"; 3127 return false; 3128 } 3129 } 3130 3131 if (isSMRD(MI)) { 3132 if (MI.mayStore()) { 3133 // The register offset form of scalar stores may only use m0 as the 3134 // soffset register. 3135 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 3136 if (Soff && Soff->getReg() != AMDGPU::M0) { 3137 ErrInfo = "scalar stores must use m0 as offset register"; 3138 return false; 3139 } 3140 } 3141 } 3142 3143 if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { 3144 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 3145 if (Offset->getImm() != 0) { 3146 ErrInfo = "subtarget does not support offsets in flat instructions"; 3147 return false; 3148 } 3149 } 3150 3151 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 3152 if (DppCt) { 3153 using namespace AMDGPU::DPP; 3154 3155 unsigned DC = DppCt->getImm(); 3156 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 3157 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 3158 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 3159 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 3160 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 3161 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { 3162 ErrInfo = "Invalid dpp_ctrl value"; 3163 return false; 3164 } 3165 } 3166 3167 return true; 3168 } 3169 3170 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { 3171 switch (MI.getOpcode()) { 3172 default: return AMDGPU::INSTRUCTION_LIST_END; 3173 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 3174 case AMDGPU::COPY: return AMDGPU::COPY; 3175 case AMDGPU::PHI: return AMDGPU::PHI; 3176 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 3177 case AMDGPU::WQM: return AMDGPU::WQM; 3178 case AMDGPU::WWM: return AMDGPU::WWM; 3179 case AMDGPU::S_MOV_B32: 3180 return MI.getOperand(1).isReg() ? 3181 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 3182 case AMDGPU::S_ADD_I32: 3183 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; 3184 case AMDGPU::S_ADDC_U32: 3185 return AMDGPU::V_ADDC_U32_e32; 3186 case AMDGPU::S_SUB_I32: 3187 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; 3188 // FIXME: These are not consistently handled, and selected when the carry is 3189 // used. 3190 case AMDGPU::S_ADD_U32: 3191 return AMDGPU::V_ADD_I32_e32; 3192 case AMDGPU::S_SUB_U32: 3193 return AMDGPU::V_SUB_I32_e32; 3194 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 3195 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; 3196 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 3197 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 3198 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 3199 case AMDGPU::S_XNOR_B32: 3200 return ST.hasDLInsts() ? 
AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 3201 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 3202 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 3203 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 3204 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 3205 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 3206 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; 3207 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 3208 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; 3209 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 3210 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; 3211 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; 3212 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; 3213 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; 3214 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; 3215 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 3216 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 3217 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 3218 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 3219 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 3220 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 3221 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 3222 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 3223 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 3224 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 3225 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 3226 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 3227 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 3228 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 3229 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 3230 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 3231 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 3232 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 3233 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 3234 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 3235 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 3236 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 3237 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 3238 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 3239 } 3240 } 3241 3242 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 3243 unsigned OpNo) const { 3244 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3245 const MCInstrDesc &Desc = get(MI.getOpcode()); 3246 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 3247 Desc.OpInfo[OpNo].RegClass == -1) { 3248 unsigned Reg = MI.getOperand(OpNo).getReg(); 3249 3250 if (TargetRegisterInfo::isVirtualRegister(Reg)) 3251 return MRI.getRegClass(Reg); 3252 return RI.getPhysRegClass(Reg); 3253 } 3254 3255 unsigned RCID = Desc.OpInfo[OpNo].RegClass; 3256 return RI.getRegClass(RCID); 3257 } 3258 3259 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 3260 MachineBasicBlock::iterator I = MI; 3261 MachineBasicBlock *MBB = MI.getParent(); 3262 MachineOperand &MO = MI.getOperand(OpIdx); 3263 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 3264 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; 3265 const TargetRegisterClass *RC = RI.getRegClass(RCID); 3266 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 3267 if (MO.isReg()) 3268 Opcode = 
AMDGPU::COPY; 3269 else if (RI.isSGPRClass(RC)) 3270 Opcode = AMDGPU::S_MOV_B32; 3271 3272 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 3273 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) 3274 VRC = &AMDGPU::VReg_64RegClass; 3275 else 3276 VRC = &AMDGPU::VGPR_32RegClass; 3277 3278 unsigned Reg = MRI.createVirtualRegister(VRC); 3279 DebugLoc DL = MBB->findDebugLoc(I); 3280 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 3281 MO.ChangeToRegister(Reg, false); 3282 } 3283 3284 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 3285 MachineRegisterInfo &MRI, 3286 MachineOperand &SuperReg, 3287 const TargetRegisterClass *SuperRC, 3288 unsigned SubIdx, 3289 const TargetRegisterClass *SubRC) 3290 const { 3291 MachineBasicBlock *MBB = MI->getParent(); 3292 DebugLoc DL = MI->getDebugLoc(); 3293 unsigned SubReg = MRI.createVirtualRegister(SubRC); 3294 3295 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 3296 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 3297 .addReg(SuperReg.getReg(), 0, SubIdx); 3298 return SubReg; 3299 } 3300 3301 // Just in case the super register is itself a sub-register, copy it to a new 3302 // value so we don't need to worry about merging its subreg index with the 3303 // SubIdx passed to this function. The register coalescer should be able to 3304 // eliminate this extra copy. 3305 unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); 3306 3307 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 3308 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 3309 3310 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 3311 .addReg(NewSuperReg, 0, SubIdx); 3312 3313 return SubReg; 3314 } 3315 3316 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 3317 MachineBasicBlock::iterator MII, 3318 MachineRegisterInfo &MRI, 3319 MachineOperand &Op, 3320 const TargetRegisterClass *SuperRC, 3321 unsigned SubIdx, 3322 const TargetRegisterClass *SubRC) const { 3323 if (Op.isImm()) { 3324 if (SubIdx == AMDGPU::sub0) 3325 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 3326 if (SubIdx == AMDGPU::sub1) 3327 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 3328 3329 llvm_unreachable("Unhandled register index for immediate"); 3330 } 3331 3332 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 3333 SubIdx, SubRC); 3334 return MachineOperand::CreateReg(SubReg, false); 3335 } 3336 3337 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 3338 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 3339 assert(Inst.getNumExplicitOperands() == 3); 3340 MachineOperand Op1 = Inst.getOperand(1); 3341 Inst.RemoveOperand(1); 3342 Inst.addOperand(Op1); 3343 } 3344 3345 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 3346 const MCOperandInfo &OpInfo, 3347 const MachineOperand &MO) const { 3348 if (!MO.isReg()) 3349 return false; 3350 3351 unsigned Reg = MO.getReg(); 3352 const TargetRegisterClass *RC = 3353 TargetRegisterInfo::isVirtualRegister(Reg) ? 3354 MRI.getRegClass(Reg) : 3355 RI.getPhysRegClass(Reg); 3356 3357 const SIRegisterInfo *TRI = 3358 static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 3359 RC = TRI->getSubRegClass(RC, MO.getSubReg()); 3360 3361 // In order to be legal, the common sub-class must be equal to the 3362 // class of the current operand. 
For example: 3363 // 3364 // v_mov_b32 s0 ; Operand defined as vsrc_b32 3365 // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL 3366 // 3367 // s_sendmsg 0, s0 ; Operand defined as m0reg 3368 // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL 3369 3370 return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; 3371 } 3372 3373 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 3374 const MCOperandInfo &OpInfo, 3375 const MachineOperand &MO) const { 3376 if (MO.isReg()) 3377 return isLegalRegOperand(MRI, OpInfo, MO); 3378 3379 // Handle non-register types that are treated like immediates. 3380 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); 3381 return true; 3382 } 3383 3384 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 3385 const MachineOperand *MO) const { 3386 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3387 const MCInstrDesc &InstDesc = MI.getDesc(); 3388 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 3389 const TargetRegisterClass *DefinedRC = 3390 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 3391 if (!MO) 3392 MO = &MI.getOperand(OpIdx); 3393 3394 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 3395 3396 RegSubRegPair SGPRUsed; 3397 if (MO->isReg()) 3398 SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); 3399 3400 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3401 if (i == OpIdx) 3402 continue; 3403 const MachineOperand &Op = MI.getOperand(i); 3404 if (Op.isReg()) { 3405 if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && 3406 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 3407 return false; 3408 } 3409 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 3410 return false; 3411 } 3412 } 3413 } 3414 3415 if (MO->isReg()) { 3416 assert(DefinedRC); 3417 return isLegalRegOperand(MRI, OpInfo, *MO); 3418 } 3419 3420 // Handle non-register types that are treated like immediates. 3421 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); 3422 3423 if (!DefinedRC) { 3424 // This operand expects an immediate. 3425 return true; 3426 } 3427 3428 return isImmOperandLegal(MI, OpIdx, *MO); 3429 } 3430 3431 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 3432 MachineInstr &MI) const { 3433 unsigned Opc = MI.getOpcode(); 3434 const MCInstrDesc &InstrDesc = get(Opc); 3435 3436 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 3437 MachineOperand &Src1 = MI.getOperand(Src1Idx); 3438 3439 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 3440 // we need to only have one constant bus use. 3441 // 3442 // Note we do not need to worry about literal constants here. They are 3443 // disabled for the operand type for instructions because they will always 3444 // violate the one constant bus use rule. 3445 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 3446 if (HasImplicitSGPR) { 3447 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 3448 MachineOperand &Src0 = MI.getOperand(Src0Idx); 3449 3450 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) 3451 legalizeOpWithMove(MI, Src0Idx); 3452 } 3453 3454 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 3455 // both the value to write (src0) and lane select (src1). Fix up non-SGPR 3456 // src0/src1 with V_READFIRSTLANE. 
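  // That is (illustrative), given
  //   V_WRITELANE_B32 %vdst, %val:vgpr_32, %lane:vgpr_32
  // this rewrites it to
  //   %sval = V_READFIRSTLANE_B32 %val
  //   %slane = V_READFIRSTLANE_B32 %lane
  //   V_WRITELANE_B32 %vdst, %sval, %slane
  // which, like the V_READLANE_B32 fixup below, assumes the operands are
  // uniform.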
3457 if (Opc == AMDGPU::V_WRITELANE_B32) { 3458 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 3459 MachineOperand &Src0 = MI.getOperand(Src0Idx); 3460 const DebugLoc &DL = MI.getDebugLoc(); 3461 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 3462 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 3463 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 3464 .add(Src0); 3465 Src0.ChangeToRegister(Reg, false); 3466 } 3467 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 3468 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 3469 const DebugLoc &DL = MI.getDebugLoc(); 3470 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 3471 .add(Src1); 3472 Src1.ChangeToRegister(Reg, false); 3473 } 3474 return; 3475 } 3476 3477 // VOP2 src0 instructions support all operand types, so we don't need to check 3478 // their legality. If src1 is already legal, we don't need to do anything. 3479 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 3480 return; 3481 3482 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 3483 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 3484 // select is uniform. 3485 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 3486 RI.isVGPR(MRI, Src1.getReg())) { 3487 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 3488 const DebugLoc &DL = MI.getDebugLoc(); 3489 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 3490 .add(Src1); 3491 Src1.ChangeToRegister(Reg, false); 3492 return; 3493 } 3494 3495 // We do not use commuteInstruction here because it is too aggressive and will 3496 // commute if it is possible. We only want to commute here if it improves 3497 // legality. This can be called a fairly large number of times so don't waste 3498 // compile time pointlessly swapping and checking legality again. 3499 if (HasImplicitSGPR || !MI.isCommutable()) { 3500 legalizeOpWithMove(MI, Src1Idx); 3501 return; 3502 } 3503 3504 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 3505 MachineOperand &Src0 = MI.getOperand(Src0Idx); 3506 3507 // If src0 can be used as src1, commuting will make the operands legal. 3508 // Otherwise we have to give up and insert a move. 3509 // 3510 // TODO: Other immediate-like operand kinds could be commuted if there was a 3511 // MachineOperand::ChangeTo* for them. 3512 if ((!Src1.isImm() && !Src1.isReg()) || 3513 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 3514 legalizeOpWithMove(MI, Src1Idx); 3515 return; 3516 } 3517 3518 int CommutedOpc = commuteOpcode(MI); 3519 if (CommutedOpc == -1) { 3520 legalizeOpWithMove(MI, Src1Idx); 3521 return; 3522 } 3523 3524 MI.setDesc(get(CommutedOpc)); 3525 3526 unsigned Src0Reg = Src0.getReg(); 3527 unsigned Src0SubReg = Src0.getSubReg(); 3528 bool Src0Kill = Src0.isKill(); 3529 3530 if (Src1.isImm()) 3531 Src0.ChangeToImmediate(Src1.getImm()); 3532 else if (Src1.isReg()) { 3533 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 3534 Src0.setSubReg(Src1.getSubReg()); 3535 } else 3536 llvm_unreachable("Should only have register or immediate operands"); 3537 3538 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 3539 Src1.setSubReg(Src0SubReg); 3540 } 3541 3542 // Legalize VOP3 operands. 
Because all operand types are supported for any
3543 // operand, and since literal constants are not allowed and should never be
3544 // seen, we only need to worry about inserting copies if we use multiple SGPR
3545 // operands.
3546 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3547 MachineInstr &MI) const {
3548 unsigned Opc = MI.getOpcode();
3549
3550 int VOP3Idx[3] = {
3551 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3552 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3554 };
3555
3556 // Find the one SGPR operand we are allowed to use.
3557 unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3558
3559 for (unsigned i = 0; i < 3; ++i) {
3560 int Idx = VOP3Idx[i];
3561 if (Idx == -1)
3562 break;
3563 MachineOperand &MO = MI.getOperand(Idx);
3564
3565 // We should never see a VOP3 instruction with an illegal immediate operand.
3566 if (!MO.isReg())
3567 continue;
3568
3569 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3570 continue; // VGPRs are legal
3571
3572 if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3573 SGPRReg = MO.getReg();
3574 // We can use one SGPR in each VOP3 instruction.
3575 continue;
3576 }
3577
3578 // If we make it this far, then the operand is not legal and we must
3579 // legalize it.
3580 legalizeOpWithMove(MI, Idx);
3581 }
3582 }
3583
3584 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3585 MachineRegisterInfo &MRI) const {
3586 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3587 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3588 unsigned DstReg = MRI.createVirtualRegister(SRC);
3589 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3590
3591 if (SubRegs == 1) {
3592 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3593 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3594 .addReg(SrcReg);
3595 return DstReg;
3596 }
3597
3598 SmallVector<unsigned, 8> SRegs;
3599 for (unsigned i = 0; i < SubRegs; ++i) {
3600 unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3601 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3602 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3603 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3604 SRegs.push_back(SGPR);
3605 }
3606
3607 MachineInstrBuilder MIB =
3608 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3609 get(AMDGPU::REG_SEQUENCE), DstReg);
3610 for (unsigned i = 0; i < SubRegs; ++i) {
3611 MIB.addReg(SRegs[i]);
3612 MIB.addImm(RI.getSubRegFromChannel(i));
3613 }
3614 return DstReg;
3615 }
3616
3617 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3618 MachineInstr &MI) const {
3619
3620 // If the pointer is stored in VGPRs, then we need to move it to
3621 // SGPRs using v_readfirstlane. This is safe because we only select
3622 // loads with uniform pointers to SMRD instructions so we know the
3623 // pointer value is uniform.
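  // For example, a 64-bit pointer currently held in a VReg_64 sbase operand is
  // rewritten by readlaneVGPRToSGPR above into two v_readfirstlane_b32
  // instructions feeding a REG_SEQUENCE that defines the equivalent SGPR-class
  // register, which then replaces the operand.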
3624 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 3625 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 3626 unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 3627 SBase->setReg(SGPR); 3628 } 3629 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); 3630 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 3631 unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 3632 SOff->setReg(SGPR); 3633 } 3634 } 3635 3636 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 3637 MachineBasicBlock::iterator I, 3638 const TargetRegisterClass *DstRC, 3639 MachineOperand &Op, 3640 MachineRegisterInfo &MRI, 3641 const DebugLoc &DL) const { 3642 unsigned OpReg = Op.getReg(); 3643 unsigned OpSubReg = Op.getSubReg(); 3644 3645 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 3646 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 3647 3648 // Check if operand is already the correct register class. 3649 if (DstRC == OpRC) 3650 return; 3651 3652 unsigned DstReg = MRI.createVirtualRegister(DstRC); 3653 MachineInstr *Copy = 3654 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 3655 3656 Op.setReg(DstReg); 3657 Op.setSubReg(0); 3658 3659 MachineInstr *Def = MRI.getVRegDef(OpReg); 3660 if (!Def) 3661 return; 3662 3663 // Try to eliminate the copy if it is copying an immediate value. 3664 if (Def->isMoveImmediate()) 3665 FoldImmediate(*Copy, *Def, OpReg, &MRI); 3666 } 3667 3668 // Emit the actual waterfall loop, executing the wrapped instruction for each 3669 // unique value of \p Rsrc across all lanes. In the best case we execute 1 3670 // iteration, in the worst case we execute 64 (once per lane). 3671 static void 3672 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, 3673 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 3674 const DebugLoc &DL, MachineOperand &Rsrc) { 3675 MachineBasicBlock::iterator I = LoopBB.begin(); 3676 3677 unsigned VRsrc = Rsrc.getReg(); 3678 unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); 3679 3680 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3681 unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3682 unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3683 unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 3684 unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3685 unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3686 unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3687 unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3688 unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); 3689 3690 // Beginning of the loop, read the next Rsrc variant. 
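  // Each iteration reads the 128-bit rsrc of the first active lane into SGPRs
  // one 32-bit sub-register at a time (four v_readfirstlane_b32), compares it
  // against every lane's VGPR value (two 64-bit compares ANDed together), and
  // then restricts EXEC to the matching lanes for this trip through the loop.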
3691 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) 3692 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0); 3693 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1) 3694 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1); 3695 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2) 3696 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2); 3697 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3) 3698 .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3); 3699 3700 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc) 3701 .addReg(SRsrcSub0) 3702 .addImm(AMDGPU::sub0) 3703 .addReg(SRsrcSub1) 3704 .addImm(AMDGPU::sub1) 3705 .addReg(SRsrcSub2) 3706 .addImm(AMDGPU::sub2) 3707 .addReg(SRsrcSub3) 3708 .addImm(AMDGPU::sub3); 3709 3710 // Update Rsrc operand to use the SGPR Rsrc. 3711 Rsrc.setReg(SRsrc); 3712 Rsrc.setIsKill(true); 3713 3714 // Identify all lanes with identical Rsrc operands in their VGPRs. 3715 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0) 3716 .addReg(SRsrc, 0, AMDGPU::sub0_sub1) 3717 .addReg(VRsrc, 0, AMDGPU::sub0_sub1); 3718 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1) 3719 .addReg(SRsrc, 0, AMDGPU::sub2_sub3) 3720 .addReg(VRsrc, 0, AMDGPU::sub2_sub3); 3721 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond) 3722 .addReg(CondReg0) 3723 .addReg(CondReg1); 3724 3725 MRI.setSimpleHint(SaveExec, AndCond); 3726 3727 // Update EXEC to matching lanes, saving original to SaveExec. 3728 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec) 3729 .addReg(AndCond, RegState::Kill); 3730 3731 // The original instruction is here; we insert the terminators after it. 3732 I = LoopBB.end(); 3733 3734 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 3735 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) 3736 .addReg(AMDGPU::EXEC) 3737 .addReg(SaveExec); 3738 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); 3739 } 3740 3741 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register 3742 // with SGPRs by iterating over all unique values across all lanes. 3743 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, 3744 MachineOperand &Rsrc, MachineDominatorTree *MDT) { 3745 MachineBasicBlock &MBB = *MI.getParent(); 3746 MachineFunction &MF = *MBB.getParent(); 3747 MachineRegisterInfo &MRI = MF.getRegInfo(); 3748 MachineBasicBlock::iterator I(&MI); 3749 const DebugLoc &DL = MI.getDebugLoc(); 3750 3751 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 3752 3753 // Save the EXEC mask 3754 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec) 3755 .addReg(AMDGPU::EXEC); 3756 3757 // Killed uses in the instruction we are waterfalling around will be 3758 // incorrect due to the added control-flow. 3759 for (auto &MO : MI.uses()) { 3760 if (MO.isReg() && MO.isUse()) { 3761 MRI.clearKillFlags(MO.getReg()); 3762 } 3763 } 3764 3765 // To insert the loop we need to split the block. Move everything after this 3766 // point to a new block, and insert a new empty block between the two. 
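  // After the split the control flow is:  MBB -> LoopBB -> RemainderBB,
  // with LoopBB also branching back to itself (s_cbranch_execnz) until every
  // unique rsrc value in the wave has been processed.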
3767 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 3768 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 3769 MachineFunction::iterator MBBI(MBB); 3770 ++MBBI; 3771 3772 MF.insert(MBBI, LoopBB); 3773 MF.insert(MBBI, RemainderBB); 3774 3775 LoopBB->addSuccessor(LoopBB); 3776 LoopBB->addSuccessor(RemainderBB); 3777 3778 // Move MI to the LoopBB, and the remainder of the block to RemainderBB. 3779 MachineBasicBlock::iterator J = I++; 3780 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 3781 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 3782 LoopBB->splice(LoopBB->begin(), &MBB, J); 3783 3784 MBB.addSuccessor(LoopBB); 3785 3786 // Update dominators. We know that MBB immediately dominates LoopBB, that 3787 // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately 3788 // dominates all of the successors transferred to it from MBB that MBB used 3789 // to dominate. 3790 if (MDT) { 3791 MDT->addNewBlock(LoopBB, &MBB); 3792 MDT->addNewBlock(RemainderBB, LoopBB); 3793 for (auto &Succ : RemainderBB->successors()) { 3794 if (MDT->dominates(&MBB, Succ)) { 3795 MDT->changeImmediateDominator(Succ, RemainderBB); 3796 } 3797 } 3798 } 3799 3800 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); 3801 3802 // Restore the EXEC mask 3803 MachineBasicBlock::iterator First = RemainderBB->begin(); 3804 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 3805 .addReg(SaveExec); 3806 } 3807 3808 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 3809 static std::tuple<unsigned, unsigned> 3810 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 3811 MachineBasicBlock &MBB = *MI.getParent(); 3812 MachineFunction &MF = *MBB.getParent(); 3813 MachineRegisterInfo &MRI = MF.getRegInfo(); 3814 3815 // Extract the ptr from the resource descriptor. 
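  // The 64-bit base pointer lives in sub0_sub1 of the 128-bit rsrc. The
  // replacement descriptor built below keeps the default data-format words but
  // zeroes the pointer; the caller folds the extracted pointer into vaddr
  // instead.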
3816 unsigned RsrcPtr =
3817 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3818 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3819
3820 // Create an empty resource descriptor
3821 unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3822 unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3823 unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3824 unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3825 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3826
3827 // Zero64 = 0
3828 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3829 .addImm(0);
3830
3831 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3832 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3833 .addImm(RsrcDataFormat & 0xFFFFFFFF);
3834
3835 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3836 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3837 .addImm(RsrcDataFormat >> 32);
3838
3839 // NewSRsrc = {Zero64, SRsrcFormat}
3840 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3841 .addReg(Zero64)
3842 .addImm(AMDGPU::sub0_sub1)
3843 .addReg(SRsrcFormatLo)
3844 .addImm(AMDGPU::sub2)
3845 .addReg(SRsrcFormatHi)
3846 .addImm(AMDGPU::sub3);
3847
3848 return std::make_tuple(RsrcPtr, NewSRsrc);
3849 }
3850
3851 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3852 MachineDominatorTree *MDT) const {
3853 MachineFunction &MF = *MI.getParent()->getParent();
3854 MachineRegisterInfo &MRI = MF.getRegInfo();
3855
3856 // Legalize VOP2
3857 if (isVOP2(MI) || isVOPC(MI)) {
3858 legalizeOperandsVOP2(MRI, MI);
3859 return;
3860 }
3861
3862 // Legalize VOP3
3863 if (isVOP3(MI)) {
3864 legalizeOperandsVOP3(MRI, MI);
3865 return;
3866 }
3867
3868 // Legalize SMRD
3869 if (isSMRD(MI)) {
3870 legalizeOperandsSMRD(MRI, MI);
3871 return;
3872 }
3873
3874 // Legalize REG_SEQUENCE and PHI
3875 // The register class of the operands must be the same type as the register
3876 // class of the output.
3877 if (MI.getOpcode() == AMDGPU::PHI) {
3878 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3879 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3880 if (!MI.getOperand(i).isReg() ||
3881 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3882 continue;
3883 const TargetRegisterClass *OpRC =
3884 MRI.getRegClass(MI.getOperand(i).getReg());
3885 if (RI.hasVGPRs(OpRC)) {
3886 VRC = OpRC;
3887 } else {
3888 SRC = OpRC;
3889 }
3890 }
3891
3892 // If any of the operands are VGPR registers, then they all must be VGPRs;
3893 // otherwise we will create illegal VGPR->SGPR copies when legalizing
3894 // them.
3895 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3896 if (!VRC) {
3897 assert(SRC);
3898 VRC = RI.getEquivalentVGPRClass(SRC);
3899 }
3900 RC = VRC;
3901 } else {
3902 RC = SRC;
3903 }
3904
3905 // Update all the operands so they have the same type.
3906 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3907 MachineOperand &Op = MI.getOperand(I);
3908 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3909 continue;
3910
3911 // MI is a PHI instruction.
3912 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3913 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3914
3915 // Avoid creating no-op copies with the same src and dst reg class. These
3916 // confuse some of the machine passes.
3917 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3918 }
3919 }
3920
3921 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3922 // VGPR dest type and SGPR sources, insert copies so all operands are
3923 // VGPRs. This seems to help operand folding / the register coalescer.
3924 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3925 MachineBasicBlock *MBB = MI.getParent();
3926 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3927 if (RI.hasVGPRs(DstRC)) {
3928 // Update all the operands so they are VGPR register classes. These may
3929 // not be the same register class because REG_SEQUENCE supports mixing
3930 // subregister index types e.g. sub0_sub1 + sub2 + sub3
3931 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3932 MachineOperand &Op = MI.getOperand(I);
3933 if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3934 continue;
3935
3936 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3937 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3938 if (VRC == OpRC)
3939 continue;
3940
3941 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3942 Op.setIsKill();
3943 }
3944 }
3945
3946 return;
3947 }
3948
3949 // Legalize INSERT_SUBREG
3950 // src0 must have the same register class as dst
3951 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3952 unsigned Dst = MI.getOperand(0).getReg();
3953 unsigned Src0 = MI.getOperand(1).getReg();
3954 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3955 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3956 if (DstRC != Src0RC) {
3957 MachineBasicBlock *MBB = MI.getParent();
3958 MachineOperand &Op = MI.getOperand(1);
3959 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3960 }
3961 return;
3962 }
3963
3964 // Legalize SI_INIT_M0
3965 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3966 MachineOperand &Src = MI.getOperand(0);
3967 if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3968 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3969 return;
3970 }
3971
3972 // Legalize MIMG and MUBUF/MTBUF for shaders.
3973 //
3974 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3975 // scratch memory access. In both cases, the legalization never involves
3976 // conversion to the addr64 form.
3977 if (isMIMG(MI) ||
3978 (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
3979 (isMUBUF(MI) || isMTBUF(MI)))) {
3980 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3981 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3982 unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3983 SRsrc->setReg(SGPR);
3984 }
3985
3986 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3987 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3988 unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3989 SSamp->setReg(SGPR);
3990 }
3991 return;
3992 }
3993
3994 // Legalize MUBUF* instructions.
3995 int RsrcIdx =
3996 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3997 if (RsrcIdx != -1) {
3998 // We have a MUBUF instruction
3999 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4000 unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4001 if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4002 RI.getRegClass(RsrcRC))) {
4003 // The operands are legal.
4004 // FIXME: We may need to legalize operands besides srsrc.
4005 return;
4006 }
4007
4008 // Legalize a VGPR Rsrc.
4009 //
4010 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4011 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4012 // a zero-value SRsrc.
4013 //
4014 // If the instruction is _OFFSET (both idxen and offen disabled), and we
4015 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4016 // above.
4017 //
4018 // Otherwise we are on non-ADDR64 hardware, and/or we have
4019 // idxen/offen/bothen and we fall back to a waterfall loop.
4020
4021 MachineBasicBlock &MBB = *MI.getParent();
4022
4023 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4024 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4025 // This is already an ADDR64 instruction so we need to add the pointer
4026 // extracted from the resource descriptor to the current value of VAddr.
4027 unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4028 unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4029 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4030
4031 unsigned RsrcPtr, NewSRsrc;
4032 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4033
4034 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4035 DebugLoc DL = MI.getDebugLoc();
4036 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
4037 .addReg(RsrcPtr, 0, AMDGPU::sub0)
4038 .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
4039
4040 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4041 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
4042 .addReg(RsrcPtr, 0, AMDGPU::sub1)
4043 .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
4044
4045 // NewVaddr = {NewVaddrHi, NewVaddrLo}
4046 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4047 .addReg(NewVAddrLo)
4048 .addImm(AMDGPU::sub0)
4049 .addReg(NewVAddrHi)
4050 .addImm(AMDGPU::sub1);
4051
4052 VAddr->setReg(NewVAddr);
4053 Rsrc->setReg(NewSRsrc);
4054 } else if (!VAddr && ST.hasAddr64()) {
4055 // This instruction is the _OFFSET variant, so we need to convert it to
4056 // ADDR64.
4057 assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4058 < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4059 "FIXME: Need to emit flat atomics here");
4060
4061 unsigned RsrcPtr, NewSRsrc;
4062 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4063
4064 unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4065 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4066 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4067 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4068 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4069
4070 // Atomics with return have an additional tied operand and are
4071 // missing some of the special bits.
4072 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4073 MachineInstr *Addr64;
4074
4075 if (!VDataIn) {
4076 // Regular buffer load / store.
4077 MachineInstrBuilder MIB =
4078 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4079 .add(*VData)
4080 .addReg(NewVAddr)
4081 .addReg(NewSRsrc)
4082 .add(*SOffset)
4083 .add(*Offset);
4084
4085 // Atomics do not have this operand.
4086 if (const MachineOperand *GLC = 4087 getNamedOperand(MI, AMDGPU::OpName::glc)) { 4088 MIB.addImm(GLC->getImm()); 4089 } 4090 4091 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); 4092 4093 if (const MachineOperand *TFE = 4094 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 4095 MIB.addImm(TFE->getImm()); 4096 } 4097 4098 MIB.cloneMemRefs(MI); 4099 Addr64 = MIB; 4100 } else { 4101 // Atomics with return. 4102 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 4103 .add(*VData) 4104 .add(*VDataIn) 4105 .addReg(NewVAddr) 4106 .addReg(NewSRsrc) 4107 .add(*SOffset) 4108 .add(*Offset) 4109 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) 4110 .cloneMemRefs(MI); 4111 } 4112 4113 MI.removeFromParent(); 4114 4115 // NewVaddr = {NewVaddrHi, NewVaddrLo} 4116 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 4117 NewVAddr) 4118 .addReg(RsrcPtr, 0, AMDGPU::sub0) 4119 .addImm(AMDGPU::sub0) 4120 .addReg(RsrcPtr, 0, AMDGPU::sub1) 4121 .addImm(AMDGPU::sub1); 4122 } else { 4123 // This is another variant; legalize Rsrc with waterfall loop from VGPRs 4124 // to SGPRs. 4125 loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); 4126 } 4127 } 4128 } 4129 4130 void SIInstrInfo::moveToVALU(MachineInstr &TopInst, 4131 MachineDominatorTree *MDT) const { 4132 SetVectorType Worklist; 4133 Worklist.insert(&TopInst); 4134 4135 while (!Worklist.empty()) { 4136 MachineInstr &Inst = *Worklist.pop_back_val(); 4137 MachineBasicBlock *MBB = Inst.getParent(); 4138 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 4139 4140 unsigned Opcode = Inst.getOpcode(); 4141 unsigned NewOpcode = getVALUOp(Inst); 4142 4143 // Handle some special cases 4144 switch (Opcode) { 4145 default: 4146 break; 4147 case AMDGPU::S_ADD_U64_PSEUDO: 4148 case AMDGPU::S_SUB_U64_PSEUDO: 4149 splitScalar64BitAddSub(Worklist, Inst, MDT); 4150 Inst.eraseFromParent(); 4151 continue; 4152 case AMDGPU::S_ADD_I32: 4153 case AMDGPU::S_SUB_I32: 4154 // FIXME: The u32 versions currently selected use the carry. 
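      // On subtargets with add-no-carry instructions, moveScalarAddSub
      // rewrites the add/sub in place to V_ADD_U32_e64 / V_SUB_U32_e64 and
      // drops the SCC def; otherwise it returns false and we fall through to
      // the default handling below.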
4155 if (moveScalarAddSub(Worklist, Inst, MDT)) 4156 continue; 4157 4158 // Default handling 4159 break; 4160 case AMDGPU::S_AND_B64: 4161 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 4162 Inst.eraseFromParent(); 4163 continue; 4164 4165 case AMDGPU::S_OR_B64: 4166 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 4167 Inst.eraseFromParent(); 4168 continue; 4169 4170 case AMDGPU::S_XOR_B64: 4171 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 4172 Inst.eraseFromParent(); 4173 continue; 4174 4175 case AMDGPU::S_NAND_B64: 4176 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 4177 Inst.eraseFromParent(); 4178 continue; 4179 4180 case AMDGPU::S_NOR_B64: 4181 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 4182 Inst.eraseFromParent(); 4183 continue; 4184 4185 case AMDGPU::S_XNOR_B64: 4186 if (ST.hasDLInsts()) 4187 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 4188 else 4189 splitScalar64BitXnor(Worklist, Inst, MDT); 4190 Inst.eraseFromParent(); 4191 continue; 4192 4193 case AMDGPU::S_ANDN2_B64: 4194 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 4195 Inst.eraseFromParent(); 4196 continue; 4197 4198 case AMDGPU::S_ORN2_B64: 4199 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 4200 Inst.eraseFromParent(); 4201 continue; 4202 4203 case AMDGPU::S_NOT_B64: 4204 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 4205 Inst.eraseFromParent(); 4206 continue; 4207 4208 case AMDGPU::S_BCNT1_I32_B64: 4209 splitScalar64BitBCNT(Worklist, Inst); 4210 Inst.eraseFromParent(); 4211 continue; 4212 4213 case AMDGPU::S_BFE_I64: 4214 splitScalar64BitBFE(Worklist, Inst); 4215 Inst.eraseFromParent(); 4216 continue; 4217 4218 case AMDGPU::S_LSHL_B32: 4219 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4220 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 4221 swapOperands(Inst); 4222 } 4223 break; 4224 case AMDGPU::S_ASHR_I32: 4225 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4226 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 4227 swapOperands(Inst); 4228 } 4229 break; 4230 case AMDGPU::S_LSHR_B32: 4231 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4232 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 4233 swapOperands(Inst); 4234 } 4235 break; 4236 case AMDGPU::S_LSHL_B64: 4237 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4238 NewOpcode = AMDGPU::V_LSHLREV_B64; 4239 swapOperands(Inst); 4240 } 4241 break; 4242 case AMDGPU::S_ASHR_I64: 4243 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4244 NewOpcode = AMDGPU::V_ASHRREV_I64; 4245 swapOperands(Inst); 4246 } 4247 break; 4248 case AMDGPU::S_LSHR_B64: 4249 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4250 NewOpcode = AMDGPU::V_LSHRREV_B64; 4251 swapOperands(Inst); 4252 } 4253 break; 4254 4255 case AMDGPU::S_ABS_I32: 4256 lowerScalarAbs(Worklist, Inst); 4257 Inst.eraseFromParent(); 4258 continue; 4259 4260 case AMDGPU::S_CBRANCH_SCC0: 4261 case AMDGPU::S_CBRANCH_SCC1: 4262 // Clear unused bits of vcc 4263 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 4264 AMDGPU::VCC) 4265 .addReg(AMDGPU::EXEC) 4266 .addReg(AMDGPU::VCC); 4267 break; 4268 4269 case AMDGPU::S_BFE_U64: 4270 case AMDGPU::S_BFM_B64: 4271 llvm_unreachable("Moving this op to VALU not implemented"); 4272 4273 case AMDGPU::S_PACK_LL_B32_B16: 4274 case AMDGPU::S_PACK_LH_B32_B16: 4275 case AMDGPU::S_PACK_HH_B32_B16: 4276 movePackToVALU(Worklist, MRI, Inst); 4277 
Inst.eraseFromParent(); 4278 continue; 4279 4280 case AMDGPU::S_XNOR_B32: 4281 lowerScalarXnor(Worklist, Inst); 4282 Inst.eraseFromParent(); 4283 continue; 4284 4285 case AMDGPU::S_NAND_B32: 4286 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 4287 Inst.eraseFromParent(); 4288 continue; 4289 4290 case AMDGPU::S_NOR_B32: 4291 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 4292 Inst.eraseFromParent(); 4293 continue; 4294 4295 case AMDGPU::S_ANDN2_B32: 4296 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 4297 Inst.eraseFromParent(); 4298 continue; 4299 4300 case AMDGPU::S_ORN2_B32: 4301 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 4302 Inst.eraseFromParent(); 4303 continue; 4304 } 4305 4306 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 4307 // We cannot move this instruction to the VALU, so we should try to 4308 // legalize its operands instead. 4309 legalizeOperands(Inst, MDT); 4310 continue; 4311 } 4312 4313 // Use the new VALU Opcode. 4314 const MCInstrDesc &NewDesc = get(NewOpcode); 4315 Inst.setDesc(NewDesc); 4316 4317 // Remove any references to SCC. Vector instructions can't read from it, and 4318 // We're just about to add the implicit use / defs of VCC, and we don't want 4319 // both. 4320 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 4321 MachineOperand &Op = Inst.getOperand(i); 4322 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 4323 Inst.RemoveOperand(i); 4324 addSCCDefUsersToVALUWorklist(Inst, Worklist); 4325 } 4326 } 4327 4328 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 4329 // We are converting these to a BFE, so we need to add the missing 4330 // operands for the size and offset. 4331 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 4332 Inst.addOperand(MachineOperand::CreateImm(0)); 4333 Inst.addOperand(MachineOperand::CreateImm(Size)); 4334 4335 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 4336 // The VALU version adds the second operand to the result, so insert an 4337 // extra 0 operand. 4338 Inst.addOperand(MachineOperand::CreateImm(0)); 4339 } 4340 4341 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 4342 4343 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 4344 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 4345 // If we need to move this to VGPRs, we need to unpack the second operand 4346 // back into the 2 separate ones for bit offset and width. 4347 assert(OffsetWidthOp.isImm() && 4348 "Scalar BFE is only implemented for constant width and offset"); 4349 uint32_t Imm = OffsetWidthOp.getImm(); 4350 4351 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 4352 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 4353 Inst.RemoveOperand(2); // Remove old immediate. 4354 Inst.addOperand(MachineOperand::CreateImm(Offset)); 4355 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 4356 } 4357 4358 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 4359 unsigned NewDstReg = AMDGPU::NoRegister; 4360 if (HasDst) { 4361 unsigned DstReg = Inst.getOperand(0).getReg(); 4362 if (TargetRegisterInfo::isPhysicalRegister(DstReg)) 4363 continue; 4364 4365 // Update the destination register class. 
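      // For the generic opcodes (COPY, PHI, REG_SEQUENCE, ...) this returns
      // the equivalent VGPR class of the SGPR destination, e.g. SReg_32 ->
      // VGPR_32 (or null if the destination is already a VGPR class); for real
      // target instructions it returns the destination class unchanged.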
4366 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4367 if (!NewDstRC)
4368 continue;
4369
4370 if (Inst.isCopy() &&
4371 TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
4372 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4373 // Instead of creating a copy where src and dst are the same register
4374 // class, we just replace all uses of dst with src. These kinds of
4375 // copies interfere with the heuristics MachineSink uses to decide
4376 // whether or not to split a critical edge, since the pass assumes
4377 // that copies will end up as machine instructions and not be
4378 // eliminated.
4379 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4380 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4381 MRI.clearKillFlags(Inst.getOperand(1).getReg());
4382 Inst.getOperand(0).setReg(DstReg);
4383
4384 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4385 // these are deleted later, but at -O0 it would leave a suspicious
4386 // looking illegal copy of an undef register.
4387 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4388 Inst.RemoveOperand(I);
4389 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4390 continue;
4391 }
4392
4393 NewDstReg = MRI.createVirtualRegister(NewDstRC);
4394 MRI.replaceRegWith(DstReg, NewDstReg);
4395 }
4396
4397 // Legalize the operands
4398 legalizeOperands(Inst, MDT);
4399
4400 if (HasDst)
4401 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4402 }
4403 }
4404
4405 // Add/sub require special handling to deal with carry outs.
4406 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4407 MachineDominatorTree *MDT) const {
4408 if (ST.hasAddNoCarry()) {
4409 // Assume there is no user of scc since we don't select this in that case.
4410 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4411 // is used.
4412
4413 MachineBasicBlock &MBB = *Inst.getParent();
4414 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4415
4416 unsigned OldDstReg = Inst.getOperand(0).getReg();
4417 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4418
4419 unsigned Opc = Inst.getOpcode();
4420 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4421
4422 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4423 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4424
4425 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4426 Inst.RemoveOperand(3);
4427
4428 Inst.setDesc(get(NewOpc));
4429 Inst.addImplicitDefUseOperands(*MBB.getParent());
4430 MRI.replaceRegWith(OldDstReg, ResultReg);
4431 legalizeOperands(Inst, MDT);
4432
4433 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4434 return true;
4435 }
4436
4437 return false;
4438 }
4439
4440 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4441 MachineInstr &Inst) const {
4442 MachineBasicBlock &MBB = *Inst.getParent();
4443 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4444 MachineBasicBlock::iterator MII = Inst;
4445 DebugLoc DL = Inst.getDebugLoc();
4446
4447 MachineOperand &Dest = Inst.getOperand(0);
4448 MachineOperand &Src = Inst.getOperand(1);
4449 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4450 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4451
4452 unsigned SubOp = ST.hasAddNoCarry() ?
AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4454
4455 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4456 .addImm(0)
4457 .addReg(Src.getReg());
4458
4459 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4460 .addReg(Src.getReg())
4461 .addReg(TmpReg);
4462
4463 MRI.replaceRegWith(Dest.getReg(), ResultReg);
4464 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4465 }
4466
4467 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4468 MachineInstr &Inst) const {
4469 MachineBasicBlock &MBB = *Inst.getParent();
4470 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4471 MachineBasicBlock::iterator MII = Inst;
4472 const DebugLoc &DL = Inst.getDebugLoc();
4473
4474 MachineOperand &Dest = Inst.getOperand(0);
4475 MachineOperand &Src0 = Inst.getOperand(1);
4476 MachineOperand &Src1 = Inst.getOperand(2);
4477
4478 if (ST.hasDLInsts()) {
4479 unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4480 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4481 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4482
4483 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4484 .add(Src0)
4485 .add(Src1);
4486
4487 MRI.replaceRegWith(Dest.getReg(), NewDest);
4488 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4489 } else {
4490 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
4491 // invert either source and then perform the XOR. If either source is a
4492 // scalar register, then we can leave the inversion on the scalar unit to
4493 // achieve a better distribution of scalar and vector instructions.
4494 bool Src0IsSGPR = Src0.isReg() &&
4495 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
4496 bool Src1IsSGPR = Src1.isReg() &&
4497 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
4498 MachineInstr *Not = nullptr;
4499 MachineInstr *Xor = nullptr;
4500 unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4501 unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4502
4503 // Build a pair of scalar instructions and add them to the work list.
4504 // The next iteration over the work list will lower these to the vector
4505 // unit as necessary.
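    // For example (virtual register names are illustrative), with an SGPR src0
    // and a VGPR src1:
    //   s_xnor_b32 d, s0, v1
    // becomes
    //   s_not_b32 t, s0
    //   s_xor_b32 d, t, v1
    // and the xor is then picked up from the worklist and moved to the VALU.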
4506 if (Src0IsSGPR) { 4507 Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) 4508 .add(Src0); 4509 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 4510 .addReg(Temp) 4511 .add(Src1); 4512 } else if (Src1IsSGPR) { 4513 Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) 4514 .add(Src1); 4515 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 4516 .add(Src0) 4517 .addReg(Temp); 4518 } else { 4519 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) 4520 .add(Src0) 4521 .add(Src1); 4522 Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 4523 .addReg(Temp); 4524 Worklist.insert(Not); 4525 } 4526 4527 MRI.replaceRegWith(Dest.getReg(), NewDest); 4528 4529 Worklist.insert(Xor); 4530 4531 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 4532 } 4533 } 4534 4535 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, 4536 MachineInstr &Inst, 4537 unsigned Opcode) const { 4538 MachineBasicBlock &MBB = *Inst.getParent(); 4539 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4540 MachineBasicBlock::iterator MII = Inst; 4541 const DebugLoc &DL = Inst.getDebugLoc(); 4542 4543 MachineOperand &Dest = Inst.getOperand(0); 4544 MachineOperand &Src0 = Inst.getOperand(1); 4545 MachineOperand &Src1 = Inst.getOperand(2); 4546 4547 unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4548 unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4549 4550 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) 4551 .add(Src0) 4552 .add(Src1); 4553 4554 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 4555 .addReg(Interm); 4556 4557 Worklist.insert(&Op); 4558 Worklist.insert(&Not); 4559 4560 MRI.replaceRegWith(Dest.getReg(), NewDest); 4561 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 4562 } 4563 4564 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, 4565 MachineInstr &Inst, 4566 unsigned Opcode) const { 4567 MachineBasicBlock &MBB = *Inst.getParent(); 4568 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4569 MachineBasicBlock::iterator MII = Inst; 4570 const DebugLoc &DL = Inst.getDebugLoc(); 4571 4572 MachineOperand &Dest = Inst.getOperand(0); 4573 MachineOperand &Src0 = Inst.getOperand(1); 4574 MachineOperand &Src1 = Inst.getOperand(2); 4575 4576 unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4577 unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4578 4579 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) 4580 .add(Src1); 4581 4582 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) 4583 .add(Src0) 4584 .addReg(Interm); 4585 4586 Worklist.insert(&Not); 4587 Worklist.insert(&Op); 4588 4589 MRI.replaceRegWith(Dest.getReg(), NewDest); 4590 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 4591 } 4592 4593 void SIInstrInfo::splitScalar64BitUnaryOp( 4594 SetVectorType &Worklist, MachineInstr &Inst, 4595 unsigned Opcode) const { 4596 MachineBasicBlock &MBB = *Inst.getParent(); 4597 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4598 4599 MachineOperand &Dest = Inst.getOperand(0); 4600 MachineOperand &Src0 = Inst.getOperand(1); 4601 DebugLoc DL = Inst.getDebugLoc(); 4602 4603 MachineBasicBlock::iterator MII = Inst; 4604 4605 const MCInstrDesc &InstDesc = get(Opcode); 4606 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
4607 MRI.getRegClass(Src0.getReg()) : 4608 &AMDGPU::SGPR_32RegClass; 4609 4610 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 4611 4612 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 4613 AMDGPU::sub0, Src0SubRC); 4614 4615 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 4616 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 4617 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 4618 4619 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 4620 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 4621 4622 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 4623 AMDGPU::sub1, Src0SubRC); 4624 4625 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 4626 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 4627 4628 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); 4629 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 4630 .addReg(DestSub0) 4631 .addImm(AMDGPU::sub0) 4632 .addReg(DestSub1) 4633 .addImm(AMDGPU::sub1); 4634 4635 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 4636 4637 Worklist.insert(&LoHalf); 4638 Worklist.insert(&HiHalf); 4639 4640 // We don't need to legalizeOperands here because for a single operand, src0 4641 // will support any kind of input. 4642 4643 // Move all users of this moved value. 4644 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 4645 } 4646 4647 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, 4648 MachineInstr &Inst, 4649 MachineDominatorTree *MDT) const { 4650 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 4651 4652 MachineBasicBlock &MBB = *Inst.getParent(); 4653 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4654 4655 unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 4656 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4657 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4658 4659 unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 4660 unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 4661 4662 MachineOperand &Dest = Inst.getOperand(0); 4663 MachineOperand &Src0 = Inst.getOperand(1); 4664 MachineOperand &Src1 = Inst.getOperand(2); 4665 const DebugLoc &DL = Inst.getDebugLoc(); 4666 MachineBasicBlock::iterator MII = Inst; 4667 4668 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); 4669 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); 4670 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 4671 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 4672 4673 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 4674 AMDGPU::sub0, Src0SubRC); 4675 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 4676 AMDGPU::sub0, Src1SubRC); 4677 4678 4679 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 4680 AMDGPU::sub1, Src0SubRC); 4681 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 4682 AMDGPU::sub1, Src1SubRC); 4683 4684 unsigned LoOpc = IsAdd ? 
AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4685 MachineInstr *LoHalf =
4686 BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4687 .addReg(CarryReg, RegState::Define)
4688 .add(SrcReg0Sub0)
4689 .add(SrcReg1Sub0);
4690
4691 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4692 MachineInstr *HiHalf =
4693 BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4694 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4695 .add(SrcReg0Sub1)
4696 .add(SrcReg1Sub1)
4697 .addReg(CarryReg, RegState::Kill);
4698
4699 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4700 .addReg(DestSub0)
4701 .addImm(AMDGPU::sub0)
4702 .addReg(DestSub1)
4703 .addImm(AMDGPU::sub1);
4704
4705 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4706
4707 // Try to legalize the operands in case we need to swap the order to keep it
4708 // valid.
4709 legalizeOperands(*LoHalf, MDT);
4710 legalizeOperands(*HiHalf, MDT);
4711
4712 // Move all users of this moved value.
4713 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4714 }
4715
4716 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4717 MachineInstr &Inst, unsigned Opcode,
4718 MachineDominatorTree *MDT) const {
4719 MachineBasicBlock &MBB = *Inst.getParent();
4720 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4721
4722 MachineOperand &Dest = Inst.getOperand(0);
4723 MachineOperand &Src0 = Inst.getOperand(1);
4724 MachineOperand &Src1 = Inst.getOperand(2);
4725 DebugLoc DL = Inst.getDebugLoc();
4726
4727 MachineBasicBlock::iterator MII = Inst;
4728
4729 const MCInstrDesc &InstDesc = get(Opcode);
4730 const TargetRegisterClass *Src0RC = Src0.isReg() ?
4731 MRI.getRegClass(Src0.getReg()) :
4732 &AMDGPU::SGPR_32RegClass;
4733
4734 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4735 const TargetRegisterClass *Src1RC = Src1.isReg() ?
4736 MRI.getRegClass(Src1.getReg()) :
4737 &AMDGPU::SGPR_32RegClass;
4738
4739 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4740
4741 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4742 AMDGPU::sub0, Src0SubRC);
4743 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4744 AMDGPU::sub0, Src1SubRC);
4745 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4746 AMDGPU::sub1, Src0SubRC);
4747 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4748 AMDGPU::sub1, Src1SubRC);
4749
4750 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4751 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4752 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4753
4754 unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4755 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4756 .add(SrcReg0Sub0)
4757 .add(SrcReg1Sub0);
4758
4759 unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4760 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4761 .add(SrcReg0Sub1)
4762 .add(SrcReg1Sub1);
4763
4764 unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4765 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4766 .addReg(DestSub0)
4767 .addImm(AMDGPU::sub0)
4768 .addReg(DestSub1)
4769 .addImm(AMDGPU::sub1);
4770
4771 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4772
4773 Worklist.insert(&LoHalf);
4774 Worklist.insert(&HiHalf);
4775
4776 // Move all users of this moved value.
4777 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4778 }
4779
4780 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
4781 MachineInstr &Inst,
4782 MachineDominatorTree *MDT) const {
4783 MachineBasicBlock &MBB = *Inst.getParent();
4784 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4785
4786 MachineOperand &Dest = Inst.getOperand(0);
4787 MachineOperand &Src0 = Inst.getOperand(1);
4788 MachineOperand &Src1 = Inst.getOperand(2);
4789 const DebugLoc &DL = Inst.getDebugLoc();
4790
4791 MachineBasicBlock::iterator MII = Inst;
4792
4793 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4794
4795 unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4796
4797 MachineOperand* Op0;
4798 MachineOperand* Op1;
4799
4800 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
4801 Op0 = &Src0;
4802 Op1 = &Src1;
4803 } else {
4804 Op0 = &Src1;
4805 Op1 = &Src0;
4806 }
4807
4808 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
4809 .add(*Op0);
4810
4811 unsigned NewDest = MRI.createVirtualRegister(DestRC);
4812
4813 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
4814 .addReg(Interm)
4815 .add(*Op1);
4816
4817 MRI.replaceRegWith(Dest.getReg(), NewDest);
4818
4819 Worklist.insert(&Xor);
4820 }
4821
4822 void SIInstrInfo::splitScalar64BitBCNT(
4823 SetVectorType &Worklist, MachineInstr &Inst) const {
4824 MachineBasicBlock &MBB = *Inst.getParent();
4825 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4826
4827 MachineBasicBlock::iterator MII = Inst;
4828 const DebugLoc &DL = Inst.getDebugLoc();
4829
4830 MachineOperand &Dest = Inst.getOperand(0);
4831 MachineOperand &Src = Inst.getOperand(1);
4832
4833 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4834 const TargetRegisterClass *SrcRC = Src.isReg() ?
4835 MRI.getRegClass(Src.getReg()) :
4836 &AMDGPU::SGPR_32RegClass;
4837
4838 unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4839 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4840
4841 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4842
4843 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4844 AMDGPU::sub0, SrcSubRC);
4845 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4846 AMDGPU::sub1, SrcSubRC);
4847
4848 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4849
4850 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4851
4852 MRI.replaceRegWith(Dest.getReg(), ResultReg);
4853
4854 // We don't need to legalize operands here. src0 for either instruction can be
4855 // an SGPR, and the second input is unused or determined here.
4856 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4857 }
4858
4859 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4860 MachineInstr &Inst) const {
4861 MachineBasicBlock &MBB = *Inst.getParent();
4862 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4863 MachineBasicBlock::iterator MII = Inst;
4864 const DebugLoc &DL = Inst.getDebugLoc();
4865
4866 MachineOperand &Dest = Inst.getOperand(0);
4867 uint32_t Imm = Inst.getOperand(2).getImm();
4868 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4869 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4870
4871 (void) Offset;
4872
4873 // Only sext_inreg cases handled.
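  // The S_BFE_I64 immediate packs the field offset in bits [5:0] and the field
  // width in bits [22:16]; only offset == 0 (plain sext_inreg) is expected
  // here. For widths below 32 the expansion is a v_bfe_i32 on the low half plus
  // an arithmetic shift right by 31 to produce the sign-extended high half.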
4874 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 4875 Offset == 0 && "Not implemented"); 4876 4877 if (BitWidth < 32) { 4878 unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4879 unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4880 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 4881 4882 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) 4883 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 4884 .addImm(0) 4885 .addImm(BitWidth); 4886 4887 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 4888 .addImm(31) 4889 .addReg(MidRegLo); 4890 4891 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 4892 .addReg(MidRegLo) 4893 .addImm(AMDGPU::sub0) 4894 .addReg(MidRegHi) 4895 .addImm(AMDGPU::sub1); 4896 4897 MRI.replaceRegWith(Dest.getReg(), ResultReg); 4898 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 4899 return; 4900 } 4901 4902 MachineOperand &Src = Inst.getOperand(1); 4903 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4904 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 4905 4906 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 4907 .addImm(31) 4908 .addReg(Src.getReg(), 0, AMDGPU::sub0); 4909 4910 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 4911 .addReg(Src.getReg(), 0, AMDGPU::sub0) 4912 .addImm(AMDGPU::sub0) 4913 .addReg(TmpReg) 4914 .addImm(AMDGPU::sub1); 4915 4916 MRI.replaceRegWith(Dest.getReg(), ResultReg); 4917 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 4918 } 4919 4920 void SIInstrInfo::addUsersToMoveToVALUWorklist( 4921 unsigned DstReg, 4922 MachineRegisterInfo &MRI, 4923 SetVectorType &Worklist) const { 4924 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 4925 E = MRI.use_end(); I != E;) { 4926 MachineInstr &UseMI = *I->getParent(); 4927 4928 unsigned OpNo = 0; 4929 4930 switch (UseMI.getOpcode()) { 4931 case AMDGPU::COPY: 4932 case AMDGPU::WQM: 4933 case AMDGPU::WWM: 4934 case AMDGPU::REG_SEQUENCE: 4935 case AMDGPU::PHI: 4936 case AMDGPU::INSERT_SUBREG: 4937 break; 4938 default: 4939 OpNo = I.getOperandNo(); 4940 break; 4941 } 4942 4943 if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) { 4944 Worklist.insert(&UseMI); 4945 4946 do { 4947 ++I; 4948 } while (I != E && I->getParent() == &UseMI); 4949 } else { 4950 ++I; 4951 } 4952 } 4953 } 4954 4955 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, 4956 MachineRegisterInfo &MRI, 4957 MachineInstr &Inst) const { 4958 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4959 MachineBasicBlock *MBB = Inst.getParent(); 4960 MachineOperand &Src0 = Inst.getOperand(1); 4961 MachineOperand &Src1 = Inst.getOperand(2); 4962 const DebugLoc &DL = Inst.getDebugLoc(); 4963 4964 switch (Inst.getOpcode()) { 4965 case AMDGPU::S_PACK_LL_B32_B16: { 4966 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4967 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4968 4969 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 4970 // 0. 
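      // s_pack_ll_b32_b16 d, s0, s1 packs the low 16 bits of each source, i.e.
      // d = (s1 << 16) | (s0 & 0xffff); the expansion below is a v_mov_b32 of
      // the 0xffff mask, a v_and_b32, and a v_lshl_or_b32.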
4971 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 4972 .addImm(0xffff); 4973 4974 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 4975 .addReg(ImmReg, RegState::Kill) 4976 .add(Src0); 4977 4978 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) 4979 .add(Src1) 4980 .addImm(16) 4981 .addReg(TmpReg, RegState::Kill); 4982 break; 4983 } 4984 case AMDGPU::S_PACK_LH_B32_B16: { 4985 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4986 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 4987 .addImm(0xffff); 4988 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) 4989 .addReg(ImmReg, RegState::Kill) 4990 .add(Src0) 4991 .add(Src1); 4992 break; 4993 } 4994 case AMDGPU::S_PACK_HH_B32_B16: { 4995 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4996 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4997 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 4998 .addImm(16) 4999 .add(Src0); 5000 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 5001 .addImm(0xffff0000); 5002 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) 5003 .add(Src1) 5004 .addReg(ImmReg, RegState::Kill) 5005 .addReg(TmpReg, RegState::Kill); 5006 break; 5007 } 5008 default: 5009 llvm_unreachable("unhandled s_pack_* instruction"); 5010 } 5011 5012 MachineOperand &Dest = Inst.getOperand(0); 5013 MRI.replaceRegWith(Dest.getReg(), ResultReg); 5014 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 5015 } 5016 5017 void SIInstrInfo::addSCCDefUsersToVALUWorklist( 5018 MachineInstr &SCCDefInst, SetVectorType &Worklist) const { 5019 // This assumes that all the users of SCC are in the same block 5020 // as the SCC def. 5021 for (MachineInstr &MI : 5022 make_range(MachineBasicBlock::iterator(SCCDefInst), 5023 SCCDefInst.getParent()->end())) { 5024 // Exit if we find another SCC def. 5025 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 5026 return; 5027 5028 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) 5029 Worklist.insert(&MI); 5030 } 5031 } 5032 5033 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 5034 const MachineInstr &Inst) const { 5035 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 5036 5037 switch (Inst.getOpcode()) { 5038 // For target instructions, getOpRegClass just returns the virtual register 5039 // class associated with the operand, so we need to find an equivalent VGPR 5040 // register class in order to move the instruction to the VALU. 5041 case AMDGPU::COPY: 5042 case AMDGPU::PHI: 5043 case AMDGPU::REG_SEQUENCE: 5044 case AMDGPU::INSERT_SUBREG: 5045 case AMDGPU::WQM: 5046 case AMDGPU::WWM: 5047 if (RI.hasVGPRs(NewDstRC)) 5048 return nullptr; 5049 5050 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 5051 if (!NewDstRC) 5052 return nullptr; 5053 return NewDstRC; 5054 default: 5055 return NewDstRC; 5056 } 5057 } 5058 5059 // Find the one SGPR operand we are allowed to use. 5060 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 5061 int OpIndices[3]) const { 5062 const MCInstrDesc &Desc = MI.getDesc(); 5063 5064 // Find the one SGPR operand we are allowed to use. 5065 // 5066 // First we need to consider the instruction's operand requirements before 5067 // legalizing. Some operands are required to be SGPRs, such as implicit uses 5068 // of VCC, but we are still bound by the constant bus requirement to only use 5069 // one. 
  //
  // If the operand's class is an SGPR, we can never move it.

  unsigned SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // Note that this disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64.
  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isSMRD(Opc);
}

bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
                                    int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return AMDGPU::NoRegister;

  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}

unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
                                        int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}

unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return AMDGPU::NoRegister;

  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return AMDGPU::NoRegister;
}

unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }

  return Size;
}

unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  if (isFixedSize(MI))
    return DescSize;

  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    if (Src0Idx == -1)
      return DescSize; // No operands.

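    // Illustration (encoded sizes, not taken from a specific assembler run):
    //   s_mov_b32 s0, s1         -> 4 bytes
    //   s_mov_b32 s0, 0x12345678 -> 8 bytes, because the 32-bit literal is
    // emitted as an extra dword after the instruction. Each source operand
    // below is checked for such a literal and adds 4 bytes if one is found.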
    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
      return DescSize + 4;

    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    if (Src1Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
      return DescSize + 4;

    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (Src2Idx == -1)
      return DescSize;

    if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
      return DescSize + 4;

    return DescSize;
  }

  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::EH_LABEL:
    return 0;
  case TargetOpcode::BUNDLE:
    return getInstBundleSize(MI);
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
  }
  default:
    return DescSize;
  }
}

bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
  if (!isFLAT(MI))
    return false;

  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}

bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
}

void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstr *SIIF =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
            .add(Branch->getOperand(0))
            .add(Branch->getOperand(1));
    MachineInstr *SIEND =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
            .addReg(DstReg);

    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}

void SIInstrInfo::convertNonUniformLoopRegion(
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
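  // Illustrative sketch of the rewrite performed below (register names are
  // placeholders, not the exact virtual registers created):
  //   loop entry:  %phi = PHI [ 0, <other preds> ], [ %brk, loop end ]
  //   loop end:    %brk = SI_IF_BREAK %phi, <branch condition>
  //                SI_LOOP %brk, <loop entry>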
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder HeaderPHIBuilder =
        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        MachineBasicBlock *PMBB = *PI;
        unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                                  .addReg(DstReg)
                                  .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
            .addReg(BackEdgeReg)
            .addMBB(LoopEntry);

    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}

ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
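/// Unlike the ScheduleDAG-based variant above, only the MachineFunction is
/// available here, so the recognizer is constructed directly from it.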
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
}

ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" }
  };

  return makeArrayRef(TargetFlags);
}

bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           unsigned DestReg) const {
  if (ST.hasAddNoCarry())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
  case AMDGPU::SI_KILL_I1_TERMINATOR:
    return true;
  default:
    return false;
  }
}

const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}

bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
  return RCID == AMDGPU::SReg_128RegClassID;
}

// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5
};

static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;
  }
  llvm_unreachable("Unknown subtarget generation!");
}

int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  SIEncodingFamily Gen = subtargetEncodingFamily(ST);

  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
      ST.getGeneration() >= AMDGPUSubtarget::GFX9)
    Gen = SIEncodingFamily::GFX9;

  if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
    Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
                                                      : SIEncodingFamily::SDWA;
  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
    Gen = SIEncodingFamily::GFX80;

  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
  // no encoding in the given subtarget generation.
  if (MCOp == (uint16_t)-1)
    return -1;

  return MCOp;
}

static
TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
  assert(RegOpnd.isReg());
  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
                             getRegSubRegPair(RegOpnd);
}

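// Illustration (hypothetical MIR, not taken from a test): given
//   %5:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %2:vgpr_32, %subreg.sub1
// getRegSequenceSubReg(MI, AMDGPU::sub1) yields the pair {%2, no subreg}.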
TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  for (unsigned I = 0, E = (MI.getNumOperands() - 1) / 2; I < E; ++I)
    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
      auto &RegOp = MI.getOperand(1 + 2 * I);
      return getRegOrUndef(RegOp);
    }
  return TargetInstrInfo::RegSubRegPair();
}

// Try to find the definition of reg:subreg in subreg-manipulation pseudos
// Following a subreg of reg:subreg isn't supported
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}

MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() &&
          TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}

bool llvm::isEXECMaskConstantBetweenDefAndUses(unsigned VReg,
                                               MachineRegisterInfo &MRI) {
  assert(MRI.isSSA() && "Must be run on SSA");
  auto *TRI = MRI.getTargetRegisterInfo();

  auto *DefI = MRI.getVRegDef(VReg);
  auto *BB = DefI->getParent();

  DenseSet<MachineInstr*> Uses;
  for (auto &Use : MRI.use_nodbg_operands(VReg)) {
    auto *I = Use.getParent();
    if (I->getParent() != BB)
      return false;
    Uses.insert(I);
  }

  auto E = BB->end();
  for (auto I = std::next(DefI->getIterator()); I != E; ++I) {
    Uses.erase(&*I);
    // don't check the last use
    if (Uses.empty() || I->modifiesRegister(AMDGPU::EXEC, TRI))
      break;
  }
  return Uses.empty();
}