//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(0);
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
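  //
  // Illustrative only (hypothetical sequence, not from hardware docs): a
  // two-instruction clause where the first load defines the address of the
  // second, e.g.
  //   s_load_dwordx2 s[0:1], s[4:5], 0x0
  //   s_load_dword   s2, s[0:1], 0x0
  // ClauseDefs (s[0:1]) intersects ClauseUses, so a hazard is reported and a
  // non-SMEM instruction is inserted to break the clause.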

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor need some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before, probably
    // because the only case when this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in the closed-source
    // land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
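  // For example (illustrative, hypothetical sequence):
  //   v_readfirstlane_b32 s4, v0                ; VALU writes SGPR s4
  //   buffer_load_dword   v1, off, s[8:11], s4  ; VMEM reads s4 as soffset
  // Without 5 intervening wait states the VMEM may read a stale s4.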
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.
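  //
  // Illustrative only (hypothetical sequence): on subtargets with the
  // 12-dword store hazard, an inline asm VGPR def immediately after a wide
  // store can clobber the store data before it is consumed, e.g.
  //   buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
  //   ;; inline asm defining v2 here needs a wait state first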

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. The register must be alive, so use src0 of
  // V_PERMLANE*, which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
                   MI->getOperand(0).getImm() == 0xffe3));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READLANE_B32_gfx10:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at-risk SMEM (breaking the
          //     dependency chain), or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

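  // Mitigate by waiting for all outstanding VMEM stores to complete before
  // the conflicting access: emit "s_waitcnt_vscnt null, 0" (built below).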
  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }
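
  // Note (illustrative only, derived from the constants above): for a
  // full-latency 32x32 MFMA writing an AGPR range, a following
  // v_accvgpr_read_b32 of an overlapping AGPR needs 18 wait states, while
  // reusing the result as srcC of another MFMA only needs 2.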

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32 &&
          MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(*MI) &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
      MAI = MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}