//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "AMDGPU.h" 27 #include "AMDGPUSubtarget.h" 28 #include "SIMachineFunctionInfo.h" 29 #include "llvm/ADT/MapVector.h" 30 #include "llvm/ADT/PostOrderIterator.h" 31 #include "llvm/CodeGen/MachinePostDominators.h" 32 #include "llvm/InitializePasses.h" 33 #include "llvm/Support/DebugCounter.h" 34 #include "llvm/Support/TargetParser.h" 35 using namespace llvm; 36 37 #define DEBUG_TYPE "si-insert-waitcnts" 38 39 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", 40 "Force emit s_waitcnt expcnt(0) instrs"); 41 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", 42 "Force emit s_waitcnt lgkmcnt(0) instrs"); 43 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", 44 "Force emit s_waitcnt vmcnt(0) instrs"); 45 46 static cl::opt<bool> ForceEmitZeroFlag( 47 "amdgpu-waitcnt-forcezero", 48 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), 49 cl::init(false), cl::Hidden); 50 51 namespace { 52 53 template <typename EnumT> 54 class enum_iterator 55 : public iterator_facade_base<enum_iterator<EnumT>, 56 std::forward_iterator_tag, const EnumT> { 57 EnumT Value; 58 public: 59 enum_iterator() = default; 60 enum_iterator(EnumT Value) : Value(Value) {} 61 62 enum_iterator &operator++() { 63 Value = static_cast<EnumT>(Value + 1); 64 return *this; 65 } 66 67 bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } 68 69 EnumT operator*() const { return Value; } 70 }; 71 72 // Class of object that encapsulates latest instruction counter score 73 // associated with the operand. Used for determining whether 74 // s_waitcnt instruction needs to be emited. 
75 76 #define CNT_MASK(t) (1u << (t)) 77 78 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; 79 80 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { 81 return make_range(enum_iterator<InstCounterType>(VM_CNT), 82 enum_iterator<InstCounterType>(NUM_INST_CNTS)); 83 } 84 85 using RegInterval = std::pair<int, int>; 86 87 struct { 88 unsigned VmcntMax; 89 unsigned ExpcntMax; 90 unsigned LgkmcntMax; 91 unsigned VscntMax; 92 } HardwareLimits; 93 94 struct { 95 unsigned VGPR0; 96 unsigned VGPRL; 97 unsigned SGPR0; 98 unsigned SGPRL; 99 } RegisterEncoding; 100 101 enum WaitEventType { 102 VMEM_ACCESS, // vector-memory read & write 103 VMEM_READ_ACCESS, // vector-memory read 104 VMEM_WRITE_ACCESS,// vector-memory write 105 LDS_ACCESS, // lds read & write 106 GDS_ACCESS, // gds read & write 107 SQ_MESSAGE, // send message 108 SMEM_ACCESS, // scalar-memory read & write 109 EXP_GPR_LOCK, // export holding on its data src 110 GDS_GPR_LOCK, // GDS holding on its data and addr src 111 EXP_POS_ACCESS, // write to export position 112 EXP_PARAM_ACCESS, // write to export parameter 113 VMW_GPR_LOCK, // vector-memory write holding on its data src 114 NUM_WAIT_EVENTS, 115 }; 116 117 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { 118 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), 119 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | 120 (1 << SQ_MESSAGE), 121 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | 122 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), 123 (1 << VMEM_WRITE_ACCESS) 124 }; 125 126 // The mapping is: 127 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs 128 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots 129 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs 130 // We reserve a fixed number of VGPR slots in the scoring tables for 131 // special tokens like SCMEM_LDS (needed for buffer load to LDS). 
132 enum RegisterMapping { 133 SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets. 134 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. 135 NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. 136 EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses. 137 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. 138 }; 139 140 // Enumerate different types of result-returning VMEM operations. Although 141 // s_waitcnt orders them all with a single vmcnt counter, in the absence of 142 // s_waitcnt only instructions of the same VmemType are guaranteed to write 143 // their results in order -- so there is no need to insert an s_waitcnt between 144 // two instructions of the same type that write the same vgpr. 145 enum VmemType { 146 // BUF instructions and MIMG instructions without a sampler. 147 VMEM_NOSAMPLER, 148 // MIMG instructions with a sampler. 149 VMEM_SAMPLER, 150 }; 151 152 VmemType getVmemType(const MachineInstr &Inst) { 153 assert(SIInstrInfo::isVMEM(Inst)); 154 if (!SIInstrInfo::isMIMG(Inst)) 155 return VMEM_NOSAMPLER; 156 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); 157 return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler 158 ? VMEM_SAMPLER 159 : VMEM_NOSAMPLER; 160 } 161 162 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { 163 switch (T) { 164 case VM_CNT: 165 Wait.VmCnt = std::min(Wait.VmCnt, Count); 166 break; 167 case EXP_CNT: 168 Wait.ExpCnt = std::min(Wait.ExpCnt, Count); 169 break; 170 case LGKM_CNT: 171 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); 172 break; 173 case VS_CNT: 174 Wait.VsCnt = std::min(Wait.VsCnt, Count); 175 break; 176 default: 177 llvm_unreachable("bad InstCounterType"); 178 } 179 } 180 181 // This objects maintains the current score brackets of each wait counter, and 182 // a per-register scoreboard for each wait counter. 
183 // 184 // We also maintain the latest score for every event type that can change the 185 // waitcnt in order to know if there are multiple types of events within 186 // the brackets. When multiple types of event happen in the bracket, 187 // wait count may get decreased out of order, therefore we need to put in 188 // "s_waitcnt 0" before use. 189 class WaitcntBrackets { 190 public: 191 WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} 192 193 static unsigned getWaitCountMax(InstCounterType T) { 194 switch (T) { 195 case VM_CNT: 196 return HardwareLimits.VmcntMax; 197 case LGKM_CNT: 198 return HardwareLimits.LgkmcntMax; 199 case EXP_CNT: 200 return HardwareLimits.ExpcntMax; 201 case VS_CNT: 202 return HardwareLimits.VscntMax; 203 default: 204 break; 205 } 206 return 0; 207 } 208 209 unsigned getScoreLB(InstCounterType T) const { 210 assert(T < NUM_INST_CNTS); 211 return ScoreLBs[T]; 212 } 213 214 unsigned getScoreUB(InstCounterType T) const { 215 assert(T < NUM_INST_CNTS); 216 return ScoreUBs[T]; 217 } 218 219 // Mapping from event to counter. 
220 InstCounterType eventCounter(WaitEventType E) { 221 if (WaitEventMaskForInst[VM_CNT] & (1 << E)) 222 return VM_CNT; 223 if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) 224 return LGKM_CNT; 225 if (WaitEventMaskForInst[VS_CNT] & (1 << E)) 226 return VS_CNT; 227 assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); 228 return EXP_CNT; 229 } 230 231 unsigned getRegScore(int GprNo, InstCounterType T) { 232 if (GprNo < NUM_ALL_VGPRS) { 233 return VgprScores[T][GprNo]; 234 } 235 assert(T == LGKM_CNT); 236 return SgprScores[GprNo - NUM_ALL_VGPRS]; 237 } 238 239 bool merge(const WaitcntBrackets &Other); 240 241 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, 242 const MachineRegisterInfo *MRI, 243 const SIRegisterInfo *TRI, unsigned OpNo) const; 244 245 bool counterOutOfOrder(InstCounterType T) const; 246 bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; 247 bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; 248 void determineWait(InstCounterType T, unsigned ScoreToWait, 249 AMDGPU::Waitcnt &Wait) const; 250 void applyWaitcnt(const AMDGPU::Waitcnt &Wait); 251 void applyWaitcnt(InstCounterType T, unsigned Count); 252 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, 253 const MachineRegisterInfo *MRI, WaitEventType E, 254 MachineInstr &MI); 255 256 bool hasPending() const { return PendingEvents != 0; } 257 bool hasPendingEvent(WaitEventType E) const { 258 return PendingEvents & (1 << E); 259 } 260 261 bool hasMixedPendingEvents(InstCounterType T) const { 262 unsigned Events = PendingEvents & WaitEventMaskForInst[T]; 263 // Return true if more than one bit is set in Events. 
264 return Events & (Events - 1); 265 } 266 267 bool hasPendingFlat() const { 268 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && 269 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || 270 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && 271 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); 272 } 273 274 void setPendingFlat() { 275 LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; 276 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; 277 } 278 279 // Return true if there might be pending writes to the specified vgpr by VMEM 280 // instructions with types different from V. 281 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { 282 assert(GprNo < NUM_ALL_VGPRS); 283 return VgprVmemTypes[GprNo] & ~(1 << V); 284 } 285 286 void clearVgprVmemTypes(int GprNo) { 287 assert(GprNo < NUM_ALL_VGPRS); 288 VgprVmemTypes[GprNo] = 0; 289 } 290 291 void print(raw_ostream &); 292 void dump() { print(dbgs()); } 293 294 private: 295 struct MergeInfo { 296 unsigned OldLB; 297 unsigned OtherLB; 298 unsigned MyShift; 299 unsigned OtherShift; 300 }; 301 static bool mergeScore(const MergeInfo &M, unsigned &Score, 302 unsigned OtherScore); 303 304 void setScoreLB(InstCounterType T, unsigned Val) { 305 assert(T < NUM_INST_CNTS); 306 ScoreLBs[T] = Val; 307 } 308 309 void setScoreUB(InstCounterType T, unsigned Val) { 310 assert(T < NUM_INST_CNTS); 311 ScoreUBs[T] = Val; 312 if (T == EXP_CNT) { 313 unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); 314 if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) 315 ScoreLBs[T] = UB; 316 } 317 } 318 319 void setRegScore(int GprNo, InstCounterType T, unsigned Val) { 320 if (GprNo < NUM_ALL_VGPRS) { 321 VgprUB = std::max(VgprUB, GprNo); 322 VgprScores[T][GprNo] = Val; 323 } else { 324 assert(T == LGKM_CNT); 325 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); 326 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; 327 } 328 } 329 330 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, 331 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, 332 unsigned OpNo, unsigned Val); 333 334 
const GCNSubtarget *ST = nullptr; 335 unsigned ScoreLBs[NUM_INST_CNTS] = {0}; 336 unsigned ScoreUBs[NUM_INST_CNTS] = {0}; 337 unsigned PendingEvents = 0; 338 // Remember the last flat memory operation. 339 unsigned LastFlat[NUM_INST_CNTS] = {0}; 340 // wait_cnt scores for every vgpr. 341 // Keep track of the VgprUB and SgprUB to make merge at join efficient. 342 int VgprUB = -1; 343 int SgprUB = -1; 344 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; 345 // Wait cnt scores for every sgpr, only lgkmcnt is relevant. 346 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; 347 // Bitmask of the VmemTypes of VMEM instructions that might have a pending 348 // write to each vgpr. 349 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; 350 }; 351 352 class SIInsertWaitcnts : public MachineFunctionPass { 353 private: 354 const GCNSubtarget *ST = nullptr; 355 const SIInstrInfo *TII = nullptr; 356 const SIRegisterInfo *TRI = nullptr; 357 const MachineRegisterInfo *MRI = nullptr; 358 AMDGPU::IsaVersion IV; 359 360 DenseSet<MachineInstr *> TrackedWaitcntSet; 361 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; 362 MachinePostDominatorTree *PDT; 363 364 struct BlockInfo { 365 MachineBasicBlock *MBB; 366 std::unique_ptr<WaitcntBrackets> Incoming; 367 bool Dirty = true; 368 369 explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {} 370 }; 371 372 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; 373 374 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 375 // because of amdgpu-waitcnt-forcezero flag 376 bool ForceEmitZeroWaitcnts; 377 bool ForceEmitWaitcnt[NUM_INST_CNTS]; 378 379 public: 380 static char ID; 381 382 SIInsertWaitcnts() : MachineFunctionPass(ID) { 383 (void)ForceExpCounter; 384 (void)ForceLgkmCounter; 385 (void)ForceVMCounter; 386 } 387 388 bool runOnMachineFunction(MachineFunction &MF) override; 389 390 StringRef getPassName() const override { 391 return "SI insert wait instructions"; 392 } 393 394 void 
getAnalysisUsage(AnalysisUsage &AU) const override { 395 AU.setPreservesCFG(); 396 AU.addRequired<MachinePostDominatorTree>(); 397 MachineFunctionPass::getAnalysisUsage(AU); 398 } 399 400 bool isForceEmitWaitcnt() const { 401 for (auto T : inst_counter_types()) 402 if (ForceEmitWaitcnt[T]) 403 return true; 404 return false; 405 } 406 407 void setForceEmitWaitcnt() { 408 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; 409 // For debug builds, get the debug counter info and adjust if need be 410 #ifndef NDEBUG 411 if (DebugCounter::isCounterSet(ForceExpCounter) && 412 DebugCounter::shouldExecute(ForceExpCounter)) { 413 ForceEmitWaitcnt[EXP_CNT] = true; 414 } else { 415 ForceEmitWaitcnt[EXP_CNT] = false; 416 } 417 418 if (DebugCounter::isCounterSet(ForceLgkmCounter) && 419 DebugCounter::shouldExecute(ForceLgkmCounter)) { 420 ForceEmitWaitcnt[LGKM_CNT] = true; 421 } else { 422 ForceEmitWaitcnt[LGKM_CNT] = false; 423 } 424 425 if (DebugCounter::isCounterSet(ForceVMCounter) && 426 DebugCounter::shouldExecute(ForceVMCounter)) { 427 ForceEmitWaitcnt[VM_CNT] = true; 428 } else { 429 ForceEmitWaitcnt[VM_CNT] = false; 430 } 431 #endif // NDEBUG 432 } 433 434 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; 435 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; 436 bool generateWaitcntInstBefore(MachineInstr &MI, 437 WaitcntBrackets &ScoreBrackets, 438 MachineInstr *OldWaitcntInstr); 439 void updateEventWaitcntAfter(MachineInstr &Inst, 440 WaitcntBrackets *ScoreBrackets); 441 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, 442 WaitcntBrackets &ScoreBrackets); 443 }; 444 445 } // end anonymous namespace 446 447 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, 448 const SIInstrInfo *TII, 449 const MachineRegisterInfo *MRI, 450 const SIRegisterInfo *TRI, 451 unsigned OpNo) const { 452 const MachineOperand &Op = MI->getOperand(OpNo); 453 assert(Op.isReg()); 454 if 
(!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg())) 455 return {-1, -1}; 456 457 // A use via a PW operand does not need a waitcnt. 458 // A partial write is not a WAW. 459 assert(!Op.getSubReg() || !Op.isUndef()); 460 461 RegInterval Result; 462 463 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); 464 465 if (TRI->isVGPR(*MRI, Op.getReg())) { 466 assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); 467 Result.first = Reg - RegisterEncoding.VGPR0; 468 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); 469 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { 470 assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); 471 Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; 472 assert(Result.first >= NUM_ALL_VGPRS && 473 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); 474 } 475 // TODO: Handle TTMP 476 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... 477 else 478 return {-1, -1}; 479 480 const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo); 481 unsigned Size = TRI->getRegSizeInBits(*RC); 482 Result.second = Result.first + ((Size + 16) / 32); 483 484 return Result; 485 } 486 487 void WaitcntBrackets::setExpScore(const MachineInstr *MI, 488 const SIInstrInfo *TII, 489 const SIRegisterInfo *TRI, 490 const MachineRegisterInfo *MRI, unsigned OpNo, 491 unsigned Val) { 492 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo); 493 assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg())); 494 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 495 setRegScore(RegNo, EXP_CNT, Val); 496 } 497 } 498 499 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, 500 const SIRegisterInfo *TRI, 501 const MachineRegisterInfo *MRI, 502 WaitEventType E, MachineInstr &Inst) { 503 InstCounterType T = eventCounter(E); 504 unsigned CurrScore = getScoreUB(T) + 1; 505 if (CurrScore == 0) 506 report_fatal_error("InsertWaitcnt score wraparound"); 507 // 
PendingEvents and ScoreUB need to be update regardless if this event 508 // changes the score of a register or not. 509 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. 510 PendingEvents |= 1 << E; 511 setScoreUB(T, CurrScore); 512 513 if (T == EXP_CNT) { 514 // Put score on the source vgprs. If this is a store, just use those 515 // specific register(s). 516 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { 517 int AddrOpIdx = 518 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); 519 // All GDS operations must protect their address register (same as 520 // export.) 521 if (AddrOpIdx != -1) { 522 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); 523 } 524 525 if (Inst.mayStore()) { 526 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 527 AMDGPU::OpName::data0) != -1) { 528 setExpScore( 529 &Inst, TII, TRI, MRI, 530 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), 531 CurrScore); 532 } 533 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 534 AMDGPU::OpName::data1) != -1) { 535 setExpScore(&Inst, TII, TRI, MRI, 536 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 537 AMDGPU::OpName::data1), 538 CurrScore); 539 } 540 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 && 541 Inst.getOpcode() != AMDGPU::DS_GWS_INIT && 542 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V && 543 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR && 544 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P && 545 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER && 546 Inst.getOpcode() != AMDGPU::DS_APPEND && 547 Inst.getOpcode() != AMDGPU::DS_CONSUME && 548 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 549 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 550 const MachineOperand &Op = Inst.getOperand(I); 551 if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) { 552 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 553 } 554 } 555 } 556 } else if (TII->isFLAT(Inst)) { 557 if (Inst.mayStore()) { 558 
setExpScore( 559 &Inst, TII, TRI, MRI, 560 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 561 CurrScore); 562 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { 563 setExpScore( 564 &Inst, TII, TRI, MRI, 565 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 566 CurrScore); 567 } 568 } else if (TII->isMIMG(Inst)) { 569 if (Inst.mayStore()) { 570 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 571 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { 572 setExpScore( 573 &Inst, TII, TRI, MRI, 574 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 575 CurrScore); 576 } 577 } else if (TII->isMTBUF(Inst)) { 578 if (Inst.mayStore()) { 579 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 580 } 581 } else if (TII->isMUBUF(Inst)) { 582 if (Inst.mayStore()) { 583 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 584 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { 585 setExpScore( 586 &Inst, TII, TRI, MRI, 587 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 588 CurrScore); 589 } 590 } else { 591 if (TII->isEXP(Inst)) { 592 // For export the destination registers are really temps that 593 // can be used as the actual source after export patching, so 594 // we need to treat them like sources and set the EXP_CNT 595 // score. 596 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 597 MachineOperand &DefMO = Inst.getOperand(I); 598 if (DefMO.isReg() && DefMO.isDef() && 599 TRI->isVGPR(*MRI, DefMO.getReg())) { 600 setRegScore( 601 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), 602 EXP_CNT, CurrScore); 603 } 604 } 605 } 606 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 607 MachineOperand &MO = Inst.getOperand(I); 608 if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) { 609 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 610 } 611 } 612 } 613 #if 0 // TODO: check if this is handled by MUBUF code above. 
614 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD || 615 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || 616 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { 617 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); 618 unsigned OpNo;//TODO: find the OpNo for this operand; 619 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo); 620 for (int RegNo = Interval.first; RegNo < Interval.second; 621 ++RegNo) { 622 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); 623 } 624 #endif 625 } else { 626 // Match the score to the destination registers. 627 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 628 auto &Op = Inst.getOperand(I); 629 if (!Op.isReg() || !Op.isDef()) 630 continue; 631 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I); 632 if (T == VM_CNT) { 633 if (Interval.first >= NUM_ALL_VGPRS) 634 continue; 635 if (SIInstrInfo::isVMEM(Inst)) { 636 VmemType V = getVmemType(Inst); 637 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) 638 VgprVmemTypes[RegNo] |= 1 << V; 639 } 640 } 641 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 642 setRegScore(RegNo, T, CurrScore); 643 } 644 } 645 if (TII->isDS(Inst) && Inst.mayStore()) { 646 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); 647 } 648 } 649 } 650 651 void WaitcntBrackets::print(raw_ostream &OS) { 652 OS << '\n'; 653 for (auto T : inst_counter_types()) { 654 unsigned LB = getScoreLB(T); 655 unsigned UB = getScoreUB(T); 656 657 switch (T) { 658 case VM_CNT: 659 OS << " VM_CNT(" << UB - LB << "): "; 660 break; 661 case LGKM_CNT: 662 OS << " LGKM_CNT(" << UB - LB << "): "; 663 break; 664 case EXP_CNT: 665 OS << " EXP_CNT(" << UB - LB << "): "; 666 break; 667 case VS_CNT: 668 OS << " VS_CNT(" << UB - LB << "): "; 669 break; 670 default: 671 OS << " UNKNOWN(" << UB - LB << "): "; 672 break; 673 } 674 675 if (LB < UB) { 676 // Print vgpr scores. 
677 for (int J = 0; J <= VgprUB; J++) { 678 unsigned RegScore = getRegScore(J, T); 679 if (RegScore <= LB) 680 continue; 681 unsigned RelScore = RegScore - LB - 1; 682 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { 683 OS << RelScore << ":v" << J << " "; 684 } else { 685 OS << RelScore << ":ds "; 686 } 687 } 688 // Also need to print sgpr scores for lgkm_cnt. 689 if (T == LGKM_CNT) { 690 for (int J = 0; J <= SgprUB; J++) { 691 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); 692 if (RegScore <= LB) 693 continue; 694 unsigned RelScore = RegScore - LB - 1; 695 OS << RelScore << ":s" << J << " "; 696 } 697 } 698 } 699 OS << '\n'; 700 } 701 OS << '\n'; 702 } 703 704 /// Simplify the waitcnt, in the sense of removing redundant counts, and return 705 /// whether a waitcnt instruction is needed at all. 706 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { 707 return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | 708 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | 709 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) | 710 simplifyWaitcnt(VS_CNT, Wait.VsCnt); 711 } 712 713 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, 714 unsigned &Count) const { 715 const unsigned LB = getScoreLB(T); 716 const unsigned UB = getScoreUB(T); 717 if (Count < UB && UB - Count > LB) 718 return true; 719 720 Count = ~0u; 721 return false; 722 } 723 724 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, 725 AMDGPU::Waitcnt &Wait) const { 726 // If the score of src_operand falls within the bracket, we need an 727 // s_waitcnt instruction. 728 const unsigned LB = getScoreLB(T); 729 const unsigned UB = getScoreUB(T); 730 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { 731 if ((T == VM_CNT || T == LGKM_CNT) && 732 hasPendingFlat() && 733 !ST->hasFlatLgkmVMemCountInOrder()) { 734 // If there is a pending FLAT operation, and this is a VMem or LGKM 735 // waitcnt and the target can report early completion, then we need 736 // to force a waitcnt 0. 
737 addWait(Wait, T, 0); 738 } else if (counterOutOfOrder(T)) { 739 // Counter can get decremented out-of-order when there 740 // are multiple types event in the bracket. Also emit an s_wait counter 741 // with a conservative value of 0 for the counter. 742 addWait(Wait, T, 0); 743 } else { 744 // If a counter has been maxed out avoid overflow by waiting for 745 // MAX(CounterType) - 1 instead. 746 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); 747 addWait(Wait, T, NeededWait); 748 } 749 } 750 } 751 752 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { 753 applyWaitcnt(VM_CNT, Wait.VmCnt); 754 applyWaitcnt(EXP_CNT, Wait.ExpCnt); 755 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); 756 applyWaitcnt(VS_CNT, Wait.VsCnt); 757 } 758 759 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { 760 const unsigned UB = getScoreUB(T); 761 if (Count >= UB) 762 return; 763 if (Count != 0) { 764 if (counterOutOfOrder(T)) 765 return; 766 setScoreLB(T, std::max(getScoreLB(T), UB - Count)); 767 } else { 768 setScoreLB(T, UB); 769 PendingEvents &= ~WaitEventMaskForInst[T]; 770 } 771 } 772 773 // Where there are multiple types of event in the bracket of a counter, 774 // the decrement may go out of order. 775 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { 776 // Scalar memory read always can go out of order. 
777 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) 778 return true; 779 return hasMixedPendingEvents(T); 780 } 781 782 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 783 false) 784 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) 785 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 786 false) 787 788 char SIInsertWaitcnts::ID = 0; 789 790 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; 791 792 FunctionPass *llvm::createSIInsertWaitcntsPass() { 793 return new SIInsertWaitcnts(); 794 } 795 796 static bool readsVCCZ(const MachineInstr &MI) { 797 unsigned Opc = MI.getOpcode(); 798 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && 799 !MI.getOperand(1).isUndef(); 800 } 801 802 /// \returns true if the callee inserts an s_waitcnt 0 on function entry. 803 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { 804 // Currently all conventions wait, but this may not always be the case. 805 // 806 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make 807 // senses to omit the wait and do it in the caller. 808 return true; 809 } 810 811 /// \returns true if the callee is expected to wait for any outstanding waits 812 /// before returning. 813 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { 814 return true; 815 } 816 817 /// Generate s_waitcnt instruction to be placed before cur_Inst. 818 /// Instructions of a given type are returned in order, 819 /// but instructions of different types can complete out of order. 820 /// We rely on this in-order completion 821 /// and simply assign a score to the memory access instructions. 822 /// We keep track of the active "score bracket" to determine 823 /// if an access of a memory read requires an s_waitcnt 824 /// and if so what the value of each counter is. 825 /// The "score bracket" is bound by the lower bound and upper bound 826 /// scores (*_score_LB and *_score_ub respectively). 
/// Compute the wait counts that must be satisfied before \p MI executes,
/// fold them into any pre-existing S_WAITCNT / S_WAITCNT_VSCNT instructions
/// (the run starting at \p OldWaitcntInstr, when non-null), and emit new
/// wait instructions immediately before \p MI where still required.
/// \p ScoreBrackets is both read (to decide which waits are needed) and
/// updated (the chosen waits are applied to it).
/// \returns true if any machine instruction was added, erased, or re-encoded.
bool SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  // Meta instructions (e.g. debug-related) never need waits.
  if (MI.isMetaInstruction())
    return false;

  // Accumulates the required wait; counter fields left at ~0u mean
  // "no wait required" for that counter (see the emission checks below).
  AMDGPU::Waitcnt Wait;

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //       with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      Wait = AMDGPU::Waitcnt();

      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(CallAddrOpIdx).isReg()) {
        // Wait on LGKM for the registers holding the call target address,
        // in case they were produced by an outstanding scalar load.
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);

        // Likewise for the return-address destination registers (WAW with
        // the saved PC).
        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(
                LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        // A store aliasing a prior scalar-load address must wait on LGKM;
        // once this block post-dominates the load's block the entry can be
        // dropped, since every later path passes through this wait.
        if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
          addWait(Wait, LGKM_CNT, 0);
          if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
            SLoadAddresses.erase(Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // LDS accesses share a single pseudo-register slot past the VGPR
        // range (SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS).
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(
            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(
              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg())
          continue;
        RegInterval Interval =
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);

        const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they're guaranteed to write their
            // results in order anyway.
            if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                       getVmemType(MI))) {
              ScoreBrackets.determineWait(
                  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
              ScoreBrackets.clearVgprVmemTypes(RegNo);
            }
            if (Op.isDef()) {
              ScoreBrackets.determineWait(
                  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
            }
          }
          // LGKM dependencies apply to both VGPRs and SGPRs.
          ScoreBrackets.determineWait(
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
        }
      }
    }
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
            ScoreBrackets.getScoreUB(LGKM_CNT) &&
        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated. Even then, pre-existing waitcnts must
  // be processed: pass-generated (tracked) ones are now redundant and are
  // erased; user/hand-written ones are applied to the score brackets.
  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    bool Modified = false;
    if (OldWaitcntInstr) {
      for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
           &*II != &MI; II = NextI, ++NextI) {
        if (II->isDebugInstr())
          continue;

        if (TrackedWaitcntSet.count(&*II)) {
          TrackedWaitcntSet.erase(&*II);
          II->eraseFromParent();
          Modified = true;
        } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
          int64_t Imm = II->getOperand(0).getImm();
          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
        } else {
          assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
          assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
          auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
          ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
        }
      }
    }
    return Modified;
  }

  // Apply any debug/testing force-emit overrides.
  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;
  if (ForceEmitWaitcnt[VS_CNT])
    Wait.VsCnt = 0;

  ScoreBrackets.applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  bool Modified = false;

  // Merge the required wait into any pre-existing waitcnt instructions,
  // re-encoding them in place; counters handled that way are reset to ~0u
  // so they are not emitted again below.
  if (OldWaitcntInstr) {
    for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
         &*II != &MI; II = NextI, NextI++) {
      if (II->isDebugInstr())
        continue;

      if (II->getOpcode() == AMDGPU::S_WAITCNT) {
        unsigned IEnc = II->getOperand(0).getImm();
        AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
        OldWait = OldWait.combined(IWait);
        // Only untracked (user-written) waitcnts strengthen our requirement.
        if (!TrackedWaitcntSet.count(&*II))
          Wait = Wait.combined(IWait);
        unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
        if (IEnc != NewEnc) {
          II->getOperand(0).setImm(NewEnc);
          Modified = true;
        }
        Wait.VmCnt = ~0u;
        Wait.LgkmCnt = ~0u;
        Wait.ExpCnt = ~0u;
      } else {
        assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
        assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);

        unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
                            ->getImm();
        OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
        if (!TrackedWaitcntSet.count(&*II))
          Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
        if (Wait.VsCnt != ICnt) {
          TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
          Modified = true;
        }
        Wait.VsCnt = ~0u;
      }

      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                        << "Old Instr: " << MI
                        << "New Instr: " << *II << '\n');

      if (!Wait.hasWait())
        return Modified;
    }
  }

  // Emit a fresh S_WAITCNT for any VM/LGKM/EXP wait not absorbed above.
  if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                      << "Old Instr: " << MI
                      << "New Instr: " << *SWaitInst << '\n');
  }

  // VS_CNT lives in a separate instruction (S_WAITCNT_VSCNT, gfx10+).
  if (Wait.VsCnt != ~0u) {
    assert(ST->hasVscnt());

    auto SWaitInst =
        BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.VsCnt);
    TrackedWaitcntSet.insert(SWaitInst);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                      << "Old Instr: " << MI
                      << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
/// \returns true if the flat instruction \p MI may access global memory (and
/// therefore counts against VM_CNT), judged conservatively from its memory
/// operands.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter.
  assert(TII->usesVM_CNT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    assert(AS != AMDGPUAS::REGION_ADDRESS);
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}

// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  // A generic FLAT address may resolve to LDS at runtime, so it counts too.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

/// Record in \p ScoreBrackets the counter events generated by \p Inst after
/// it issues: classify the instruction (DS/GDS, FLAT, VMEM, SMEM, call,
/// export, message/timer) and bump the matching counter timeline(s).
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                              WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoadOrStore());

    // Counts how many address spaces (VMEM, LDS) this flat op may touch.
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      // Pre-gfx10 (no vscnt) everything goes to VM_CNT; otherwise loads
      // (except atomics-with-return, tracked as writes) use the read event.
      if (!ST->hasVscnt())
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
      else if (Inst.mayLoad() &&
               AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that access both VMEM and LDS, so note it
    // - it will require that both the VM and LGKM be flushed to zero if it is
    // pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
             Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
             Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
    if (!ST->hasVscnt())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    else if ((Inst.mayLoad() &&
              AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
    else if (Inst.mayStore())
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify exports by target: parameter, position, or other (GPR lock).
    int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}

/// Merge one score slot: shift both sides into the common (merged) frame —
/// scores at or below their old lower bound collapse to 0 (nothing pending) —
/// and keep the maximum. \returns true if \p OtherScore strictly dominated,
/// i.e. the merge tightened this slot.
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}

/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
1335 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { 1336 bool StrictDom = false; 1337 1338 VgprUB = std::max(VgprUB, Other.VgprUB); 1339 SgprUB = std::max(SgprUB, Other.SgprUB); 1340 1341 for (auto T : inst_counter_types()) { 1342 // Merge event flags for this counter 1343 const bool OldOutOfOrder = counterOutOfOrder(T); 1344 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; 1345 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; 1346 if (OtherEvents & ~OldEvents) 1347 StrictDom = true; 1348 PendingEvents |= OtherEvents; 1349 1350 // Merge scores for this counter 1351 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; 1352 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; 1353 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); 1354 if (NewUB < ScoreLBs[T]) 1355 report_fatal_error("waitcnt score overflow"); 1356 1357 MergeInfo M; 1358 M.OldLB = ScoreLBs[T]; 1359 M.OtherLB = Other.ScoreLBs[T]; 1360 M.MyShift = NewUB - ScoreUBs[T]; 1361 M.OtherShift = NewUB - Other.ScoreUBs[T]; 1362 1363 ScoreUBs[T] = NewUB; 1364 1365 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); 1366 1367 bool RegStrictDom = false; 1368 for (int J = 0; J <= VgprUB; J++) { 1369 RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); 1370 } 1371 1372 if (T == VM_CNT) { 1373 for (int J = 0; J <= VgprUB; J++) { 1374 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; 1375 RegStrictDom |= NewVmemTypes != VgprVmemTypes[J]; 1376 VgprVmemTypes[J] = NewVmemTypes; 1377 } 1378 } 1379 1380 if (T == LGKM_CNT) { 1381 for (int J = 0; J <= SgprUB; J++) { 1382 RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); 1383 } 1384 } 1385 1386 if (RegStrictDom && !OldOutOfOrder) 1387 StrictDom = true; 1388 } 1389 1390 return StrictDom; 1391 } 1392 1393 // Generate s_waitcnt instructions where needed. 
/// Walk \p Block in instruction order, inserting or updating s_waitcnt
/// instructions before each instruction that needs one, and tracking the
/// vccz-correctness workarounds for pre-gfx10 hardware.
/// \p ScoreBrackets carries the counter state into and out of the block.
/// \returns true if any instruction was added, removed, or re-encoded.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts from earlier iterations. Only the first of
    // a run is remembered; generateWaitcntInstBefore walks forward from it.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
        (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         Inst.getOperand(0).isReg() &&
         Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC)) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.getScoreLB(LGKM_CNT) <
                ScoreBrackets.getScoreUB(LGKM_CNT) &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      // Remember scalar-load addresses so later stores that alias them can
      // force an LGKM wait (see generateWaitcntInstBefore).
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        const Value *Ptr = Memop->getValue();
        SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
      ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  return Modified;
}

/// Pass entry point: set up target/limit/encoding state, then iterate over
/// the function's blocks in reverse post-order until the waitcnt dataflow
/// reaches a fixed point, finally handling scalar-store cache flushes and
/// the wait-on-entry required for non-entry functions.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockInfos.clear();

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo(MBB)});

  // A single WaitcntBrackets object is reused across blocks; it is either
  // re-seeded from a block's recorded incoming state or reset to empty.
  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Modified = false;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST);
        else
          *Brackets = WaitcntBrackets(ST);
      }

      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPending()) {
        // Propagate the outgoing state to successors; a successor earlier
        // in RPO (a back edge) forces another fixed-point iteration.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            // The brackets can be moved (not copied) into exactly one
            // successor; any further ones get a copy.
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  // Collect program-exit blocks and note whether any scalar stores exist.
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
    // Skip past any leading PHIs / meta instructions before inserting.
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    if (ST->hasVscnt())
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
          .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
          .addImm(0);

    Modified = true;
  }

  return Modified;
}