//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions ------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

#define DEBUG_TYPE "si-insert-waitcnts"

using namespace llvm;

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

typedef std::pair<signed, signed> RegInterval;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//   0                .. SQ_MAX_PGM_VGPRS-1                 real VGPRs
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                    extra VGPR-like slots
//   NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1   real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains the current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
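// (Illustrative example of the bracket mechanism: a VMEM load assigns the
// current vm_cnt upper-bound score to its destination VGPRs in the
// scoreboard; a later use of those VGPRs only needs an s_waitcnt if that
// score still lies inside the open vm_cnt bracket.)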
97 // We also maintain the latest score for every event type that can change the 98 // waitcnt in order to know if there are multiple types of events within 99 // the brackets. When multiple types of event happen in the bracket, 100 // wait-count may get decreased out of order, therefore we need to put in 101 // "s_waitcnt 0" before use. 102 class BlockWaitcntBrackets { 103 public: 104 static int32_t getWaitCountMax(InstCounterType T) { 105 switch (T) { 106 case VM_CNT: 107 return HardwareLimits.VmcntMax; 108 case LGKM_CNT: 109 return HardwareLimits.LgkmcntMax; 110 case EXP_CNT: 111 return HardwareLimits.ExpcntMax; 112 default: 113 break; 114 } 115 return 0; 116 }; 117 118 void setScoreLB(InstCounterType T, int32_t Val) { 119 assert(T < NUM_INST_CNTS); 120 if (T >= NUM_INST_CNTS) 121 return; 122 ScoreLBs[T] = Val; 123 }; 124 125 void setScoreUB(InstCounterType T, int32_t Val) { 126 assert(T < NUM_INST_CNTS); 127 if (T >= NUM_INST_CNTS) 128 return; 129 ScoreUBs[T] = Val; 130 if (T == EXP_CNT) { 131 int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT)); 132 if (ScoreLBs[T] < UB) 133 ScoreLBs[T] = UB; 134 } 135 }; 136 137 int32_t getScoreLB(InstCounterType T) { 138 assert(T < NUM_INST_CNTS); 139 if (T >= NUM_INST_CNTS) 140 return 0; 141 return ScoreLBs[T]; 142 }; 143 144 int32_t getScoreUB(InstCounterType T) { 145 assert(T < NUM_INST_CNTS); 146 if (T >= NUM_INST_CNTS) 147 return 0; 148 return ScoreUBs[T]; 149 }; 150 151 // Mapping from event to counter. 152 InstCounterType eventCounter(WaitEventType E) { 153 switch (E) { 154 case VMEM_ACCESS: 155 return VM_CNT; 156 case LDS_ACCESS: 157 case GDS_ACCESS: 158 case SQ_MESSAGE: 159 case SMEM_ACCESS: 160 return LGKM_CNT; 161 case EXP_GPR_LOCK: 162 case GDS_GPR_LOCK: 163 case VMW_GPR_LOCK: 164 case EXP_POS_ACCESS: 165 case EXP_PARAM_ACCESS: 166 return EXP_CNT; 167 default: 168 llvm_unreachable("unhandled event type"); 169 } 170 return NUM_INST_CNTS; 171 } 172 173 void setRegScore(int GprNo, InstCounterType T, int32_t Val) { 174 if (GprNo < NUM_ALL_VGPRS) { 175 if (GprNo > VgprUB) { 176 VgprUB = GprNo; 177 } 178 VgprScores[T][GprNo] = Val; 179 } else { 180 assert(T == LGKM_CNT); 181 if (GprNo - NUM_ALL_VGPRS > SgprUB) { 182 SgprUB = GprNo - NUM_ALL_VGPRS; 183 } 184 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; 185 } 186 } 187 188 int32_t getRegScore(int GprNo, InstCounterType T) { 189 if (GprNo < NUM_ALL_VGPRS) { 190 return VgprScores[T][GprNo]; 191 } 192 return SgprScores[GprNo - NUM_ALL_VGPRS]; 193 } 194 195 void clear() { 196 memset(ScoreLBs, 0, sizeof(ScoreLBs)); 197 memset(ScoreUBs, 0, sizeof(ScoreUBs)); 198 memset(EventUBs, 0, sizeof(EventUBs)); 199 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 200 T = (enum InstCounterType)(T + 1)) { 201 memset(VgprScores[T], 0, sizeof(VgprScores[T])); 202 } 203 memset(SgprScores, 0, sizeof(SgprScores)); 204 } 205 206 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, 207 const MachineRegisterInfo *MRI, 208 const SIRegisterInfo *TRI, unsigned OpNo, 209 bool Def) const; 210 211 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, 212 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, 213 unsigned OpNo, int32_t Val); 214 215 void setWaitAtBeginning() { WaitAtBeginning = true; } 216 void clearWaitAtBeginning() { WaitAtBeginning = false; } 217 bool getWaitAtBeginning() const { return WaitAtBeginning; } 218 void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } 219 int32_t getMaxVGPR() const { return VgprUB; } 220 int32_t getMaxSGPR() const { 
return SgprUB; } 221 int32_t getEventUB(enum WaitEventType W) const { 222 assert(W < NUM_WAIT_EVENTS); 223 return EventUBs[W]; 224 } 225 bool counterOutOfOrder(InstCounterType T); 226 unsigned int updateByWait(InstCounterType T, int ScoreToWait); 227 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, 228 const MachineRegisterInfo *MRI, WaitEventType E, 229 MachineInstr &MI); 230 231 BlockWaitcntBrackets() 232 : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false), MixedExpTypes(false), 233 LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) { 234 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 235 T = (enum InstCounterType)(T + 1)) { 236 memset(VgprScores[T], 0, sizeof(VgprScores[T])); 237 } 238 } 239 ~BlockWaitcntBrackets(){}; 240 241 bool hasPendingSMEM() const { 242 return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && 243 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); 244 } 245 246 bool hasPendingFlat() const { 247 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && 248 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || 249 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && 250 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); 251 } 252 253 void setPendingFlat() { 254 LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; 255 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; 256 } 257 258 int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; } 259 260 void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; } 261 262 bool getRevisitLoop() const { return RevisitLoop; } 263 void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; } 264 265 void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } 266 int32_t getPostOrder() const { return PostOrder; } 267 268 void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } 269 void clearWaitcnt() { Waitcnt = NULL; } 270 MachineInstr *getWaitcnt() const { return Waitcnt; } 271 272 bool mixedExpTypes() const { return MixedExpTypes; } 273 void setMixedExpTypes(bool MixedExpTypesIn) { 274 MixedExpTypes = MixedExpTypesIn; 275 } 276 277 void print(raw_ostream &); 278 void dump() { print(dbgs()); } 279 280 private: 281 bool WaitAtBeginning; 282 bool RevisitLoop; 283 bool ValidLoop; 284 bool MixedExpTypes; 285 MachineLoop *LoopRegion; 286 int32_t PostOrder; 287 MachineInstr *Waitcnt; 288 int32_t ScoreLBs[NUM_INST_CNTS] = {0}; 289 int32_t ScoreUBs[NUM_INST_CNTS] = {0}; 290 int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; 291 // Remember the last flat memory operation. 292 int32_t LastFlat[NUM_INST_CNTS] = {0}; 293 // wait_cnt scores for every vgpr. 294 // Keep track of the VgprUB and SgprUB to make merge at join efficient. 295 int32_t VgprUB; 296 int32_t SgprUB; 297 int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; 298 // Wait cnt scores for every sgpr, only lgkmcnt is relevant. 299 int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; 300 }; 301 302 // This is a per-loop-region object that records waitcnt status at the end of 303 // loop footer from the previous iteration. We also maintain an iteration 304 // count to track the number of times the loop has been visited. When it 305 // doesn't converge naturally, we force convergence by inserting s_waitcnt 0 306 // at the end of the loop footer. 
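// (In this implementation a loop is given at most two extra passes to
// converge on its own; once getIterCnt() exceeds that limit,
// insertWaitcntInBlock() inserts the forcing s_waitcnt at the loop bottom.)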
class LoopWaitcntData {
public:
  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  int32_t getIterCnt() { return IterCnt; }

  LoopWaitcntData() : LfWaitcnt(nullptr), IterCnt(0) {}
  ~LoopWaitcntData() {}

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() {
    DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
  }

private:
  // The s_waitcnt instruction added at the end of the loop footer to
  // stabilize the wait scores at the end of the loop.
  MachineInstr *LfWaitcnt;
  // Number of times the loop has been visited, not counting the initial
  // walk over.
  int32_t IterCnt;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const SISubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  const MachineLoopInfo *MLI;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

public:
  static char ID;

  SIInsertWaitcnts()
      : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
        MRI(nullptr), MLI(nullptr) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
                                           BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitCntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // End anonymous namespace.

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a partial-write (PW) operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not. Examples include vm_cnt for a
  // buffer store and lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
477 if (Inst.getOpcode() != AMDGPU::DS_APPEND && 478 Inst.getOpcode() != AMDGPU::DS_CONSUME) { 479 setExpScore( 480 &Inst, TII, TRI, MRI, 481 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr), 482 CurrScore); 483 } 484 if (Inst.mayStore()) { 485 setExpScore( 486 &Inst, TII, TRI, MRI, 487 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), 488 CurrScore); 489 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 490 AMDGPU::OpName::data1) != -1) { 491 setExpScore(&Inst, TII, TRI, MRI, 492 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 493 AMDGPU::OpName::data1), 494 CurrScore); 495 } 496 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 && 497 Inst.getOpcode() != AMDGPU::DS_GWS_INIT && 498 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V && 499 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR && 500 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P && 501 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER && 502 Inst.getOpcode() != AMDGPU::DS_APPEND && 503 Inst.getOpcode() != AMDGPU::DS_CONSUME && 504 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 505 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 506 const MachineOperand &Op = Inst.getOperand(I); 507 if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) { 508 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 509 } 510 } 511 } 512 } else if (TII->isFLAT(Inst)) { 513 if (Inst.mayStore()) { 514 setExpScore( 515 &Inst, TII, TRI, MRI, 516 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 517 CurrScore); 518 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { 519 setExpScore( 520 &Inst, TII, TRI, MRI, 521 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 522 CurrScore); 523 } 524 } else if (TII->isMIMG(Inst)) { 525 if (Inst.mayStore()) { 526 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 527 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { 528 setExpScore( 529 &Inst, TII, TRI, MRI, 530 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 531 CurrScore); 532 } 533 } else if (TII->isMTBUF(Inst)) { 534 if (Inst.mayStore()) { 535 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 536 } 537 } else if (TII->isMUBUF(Inst)) { 538 if (Inst.mayStore()) { 539 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 540 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { 541 setExpScore( 542 &Inst, TII, TRI, MRI, 543 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 544 CurrScore); 545 } 546 } else { 547 if (TII->isEXP(Inst)) { 548 // For export the destination registers are really temps that 549 // can be used as the actual source after export patching, so 550 // we need to treat them like sources and set the EXP_CNT 551 // score. 552 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 553 MachineOperand &DefMO = Inst.getOperand(I); 554 if (DefMO.isReg() && DefMO.isDef() && 555 TRI->isVGPR(MRIA, DefMO.getReg())) { 556 setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT, 557 CurrScore); 558 } 559 } 560 } 561 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 562 MachineOperand &MO = Inst.getOperand(I); 563 if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) { 564 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 565 } 566 } 567 } 568 #if 0 // TODO: check if this is handled by MUBUF code above. 
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; //TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait on is unknown. This implies that it was not
    // encountered during the path of the CFG walk done during the current
    // traversal but may be seen on a different path. Emit an s_waitcnt with
    // a conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if (T == VM_CNT && hasPendingFlat()) {
      // If there is a pending FLAT operation, and this is a VM waitcnt,
      // then we need to force a waitcnt 0 for VM.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are
      // multiple event types in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
675 NeedWait = CNT_MASK(T); 676 setScoreLB(T, getScoreUB(T)); 677 } else { 678 NeedWait = CNT_MASK(T); 679 setScoreLB(T, ScoreToWait); 680 } 681 } 682 683 return NeedWait; 684 } 685 686 // Where there are multiple types of event in the bracket of a counter, 687 // the decrement may go out of order. 688 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) { 689 switch (T) { 690 case VM_CNT: 691 return false; 692 case LGKM_CNT: { 693 if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && 694 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) { 695 // Scalar memory read always can go out of order. 696 return true; 697 } 698 int NumEventTypes = 0; 699 if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] && 700 EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { 701 NumEventTypes++; 702 } 703 if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] && 704 EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { 705 NumEventTypes++; 706 } 707 if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] && 708 EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) { 709 NumEventTypes++; 710 } 711 if (NumEventTypes <= 1) { 712 return false; 713 } 714 break; 715 } 716 case EXP_CNT: { 717 // If there has been a mixture of export types, then a waitcnt exp(0) is 718 // required. 719 if (MixedExpTypes) 720 return true; 721 int NumEventTypes = 0; 722 if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] && 723 EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { 724 NumEventTypes++; 725 } 726 if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] && 727 EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { 728 NumEventTypes++; 729 } 730 if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] && 731 EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { 732 NumEventTypes++; 733 } 734 if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] && 735 EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) { 736 NumEventTypes++; 737 } 738 739 if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] && 740 EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) { 741 NumEventTypes++; 742 } 743 744 if (NumEventTypes <= 1) { 745 return false; 746 } 747 break; 748 } 749 default: 750 break; 751 } 752 return true; 753 } 754 755 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 756 false) 757 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 758 false) 759 760 char SIInsertWaitcnts::ID = 0; 761 762 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; 763 764 FunctionPass *llvm::createSIInsertWaitcntsPass() { 765 return new SIInsertWaitcnts(); 766 } 767 768 static bool readsVCCZ(const MachineInstr &MI) { 769 unsigned Opc = MI.getOpcode(); 770 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && 771 !MI.getOperand(1).isUndef(); 772 } 773 774 /// \brief Generate s_waitcnt instruction to be placed before cur_Inst. 775 /// Instructions of a given type are returned in order, 776 /// but instructions of different types can complete out of order. 777 /// We rely on this in-order completion 778 /// and simply assign a score to the memory access instructions. 779 /// We keep track of the active "score bracket" to determine 780 /// if an access of a memory read requires an s_waitcnt 781 /// and if so what the value of each counter is. 782 /// The "score bracket" is bound by the lower bound and upper bound 783 /// scores (*_score_LB and *_score_ub respectively). 784 MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( 785 MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { 786 // To emit, or not to emit - that's the question! 
  // Start with the assumption that there is no need to emit.
  unsigned int EmitSwaitcnt = 0;
  // The s_waitcnt instruction to return; default is nullptr.
  MachineInstr *SWaitInst = nullptr;
  // No need to wait before a phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.
  if (MI.isDebugValue() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return SWaitInst;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitSwaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitSwaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::RETURN ||
      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitSwaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitSwaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
850 else if (MI.getOpcode() == SC_FENCE) { 851 const unsigned int group_size = 852 context->shader_info->GetMaxThreadGroupSize(); 853 // group_size == 0 means thread group size is unknown at compile time 854 const bool group_is_multi_wave = 855 (group_size == 0 || group_size > target_info->GetWaveFrontSize()); 856 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence(); 857 858 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) { 859 SCRegType src_type = Inst->GetSrcType(i); 860 switch (src_type) { 861 case SCMEM_LDS: 862 if (group_is_multi_wave || 863 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { 864 EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, 865 ScoreBrackets->getScoreUB(LGKM_CNT)); 866 // LDS may have to wait for VM_CNT after buffer load to LDS 867 if (target_info->HasBufferLoadToLDS()) { 868 EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, 869 ScoreBrackets->getScoreUB(VM_CNT)); 870 } 871 } 872 break; 873 874 case SCMEM_GDS: 875 if (group_is_multi_wave || fence_is_global) { 876 EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 877 ScoreBrackets->getScoreUB(EXP_CNT)); 878 EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, 879 ScoreBrackets->getScoreUB(LGKM_CNT)); 880 } 881 break; 882 883 case SCMEM_UAV: 884 case SCMEM_TFBUF: 885 case SCMEM_RING: 886 case SCMEM_SCATTER: 887 if (group_is_multi_wave || fence_is_global) { 888 EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 889 ScoreBrackets->getScoreUB(EXP_CNT)); 890 EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, 891 ScoreBrackets->getScoreUB(VM_CNT)); 892 } 893 break; 894 895 case SCMEM_SCRATCH: 896 default: 897 break; 898 } 899 } 900 } 901 #endif 902 903 // Export & GDS instructions do not read the EXEC mask until after the export 904 // is granted (which can occur well after the instruction is issued). 905 // The shader program must flush all EXP operations on the export-count 906 // before overwriting the EXEC mask. 907 else { 908 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { 909 // Export and GDS are tracked individually, either may trigger a waitcnt 910 // for EXEC. 911 EmitSwaitcnt |= ScoreBrackets->updateByWait( 912 EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); 913 EmitSwaitcnt |= ScoreBrackets->updateByWait( 914 EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); 915 EmitSwaitcnt |= ScoreBrackets->updateByWait( 916 EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); 917 EmitSwaitcnt |= ScoreBrackets->updateByWait( 918 EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); 919 } 920 921 #if 0 // TODO: the following code to handle CALL. 922 // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT. 923 // However, there is a problem with EXP_CNT, because the call cannot 924 // easily tell if a register is used in the function, and if it did, then 925 // the referring instruction would have to have an S_WAITCNT, which is 926 // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs 927 // before the call. 928 if (MI.getOpcode() == SC_CALL) { 929 if (ScoreBrackets->getScoreUB(EXP_CNT) > 930 ScoreBrackets->getScoreLB(EXP_CNT)) { 931 ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); 932 EmitSwaitcnt |= CNT_MASK(EXP_CNT); 933 } 934 } 935 #endif 936 937 // Look at the source operands of every instruction to see if 938 // any of them results from a previous memory operation that affects 939 // its current usage. If so, an s_waitcnt instruction needs to be 940 // emitted. 
941 // If the source operand was defined by a load, add the s_waitcnt 942 // instruction. 943 for (const MachineMemOperand *Memop : MI.memoperands()) { 944 unsigned AS = Memop->getAddrSpace(); 945 if (AS != AMDGPUASI.LOCAL_ADDRESS) 946 continue; 947 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; 948 // VM_CNT is only relevant to vgpr or LDS. 949 EmitSwaitcnt |= ScoreBrackets->updateByWait( 950 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); 951 } 952 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 953 const MachineOperand &Op = MI.getOperand(I); 954 const MachineRegisterInfo &MRIA = *MRI; 955 RegInterval Interval = 956 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false); 957 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 958 if (TRI->isVGPR(MRIA, Op.getReg())) { 959 // VM_CNT is only relevant to vgpr or LDS. 960 EmitSwaitcnt |= ScoreBrackets->updateByWait( 961 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); 962 } 963 EmitSwaitcnt |= ScoreBrackets->updateByWait( 964 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); 965 } 966 } 967 // End of for loop that looks at all source operands to decide vm_wait_cnt 968 // and lgk_wait_cnt. 969 970 // Two cases are handled for destination operands: 971 // 1) If the destination operand was defined by a load, add the s_waitcnt 972 // instruction to guarantee the right WAW order. 973 // 2) If a destination operand that was used by a recent export/store ins, 974 // add s_waitcnt on exp_cnt to guarantee the WAR order. 975 if (MI.mayStore()) { 976 for (const MachineMemOperand *Memop : MI.memoperands()) { 977 unsigned AS = Memop->getAddrSpace(); 978 if (AS != AMDGPUASI.LOCAL_ADDRESS) 979 continue; 980 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; 981 EmitSwaitcnt |= ScoreBrackets->updateByWait( 982 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); 983 EmitSwaitcnt |= ScoreBrackets->updateByWait( 984 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); 985 } 986 } 987 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 988 MachineOperand &Def = MI.getOperand(I); 989 const MachineRegisterInfo &MRIA = *MRI; 990 RegInterval Interval = 991 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); 992 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 993 if (TRI->isVGPR(MRIA, Def.getReg())) { 994 EmitSwaitcnt |= ScoreBrackets->updateByWait( 995 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); 996 EmitSwaitcnt |= ScoreBrackets->updateByWait( 997 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); 998 } 999 EmitSwaitcnt |= ScoreBrackets->updateByWait( 1000 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); 1001 } 1002 } // End of for loop that looks at all dest operands. 1003 } 1004 1005 // TODO: Tie force zero to a compiler triage option. 1006 bool ForceZero = false; 1007 1008 // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 1009 // occurs before the instruction. Doing it here prevents any additional 1010 // S_WAITCNTs from being emitted if the instruction was marked as 1011 // requiring a WAITCNT beforehand. 
1012 if (MI.getOpcode() == AMDGPU::S_BARRIER && 1013 !ST->hasAutoWaitcntBeforeBarrier()) { 1014 EmitSwaitcnt |= 1015 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); 1016 EmitSwaitcnt |= ScoreBrackets->updateByWait( 1017 EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); 1018 EmitSwaitcnt |= ScoreBrackets->updateByWait( 1019 LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); 1020 } 1021 1022 // TODO: Remove this work-around, enable the assert for Bug 457939 1023 // after fixing the scheduler. Also, the Shader Compiler code is 1024 // independent of target. 1025 if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { 1026 if (ScoreBrackets->getScoreLB(LGKM_CNT) < 1027 ScoreBrackets->getScoreUB(LGKM_CNT) && 1028 ScoreBrackets->hasPendingSMEM()) { 1029 // Wait on everything, not just LGKM. vccz reads usually come from 1030 // terminators, and we always wait on everything at the end of the 1031 // block, so if we only wait on LGKM here, we might end up with 1032 // another s_waitcnt inserted right after this if there are non-LGKM 1033 // instructions still outstanding. 1034 ForceZero = true; 1035 EmitSwaitcnt = true; 1036 } 1037 } 1038 1039 // Does this operand processing indicate s_wait counter update? 1040 if (EmitSwaitcnt) { 1041 int CntVal[NUM_INST_CNTS]; 1042 1043 bool UseDefaultWaitcntStrategy = true; 1044 if (ForceZero) { 1045 // Force all waitcnts to 0. 1046 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1047 T = (enum InstCounterType)(T + 1)) { 1048 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); 1049 } 1050 CntVal[VM_CNT] = 0; 1051 CntVal[EXP_CNT] = 0; 1052 CntVal[LGKM_CNT] = 0; 1053 UseDefaultWaitcntStrategy = false; 1054 } 1055 1056 if (UseDefaultWaitcntStrategy) { 1057 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1058 T = (enum InstCounterType)(T + 1)) { 1059 if (EmitSwaitcnt & CNT_MASK(T)) { 1060 int Delta = 1061 ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); 1062 int MaxDelta = ScoreBrackets->getWaitCountMax(T); 1063 if (Delta >= MaxDelta) { 1064 Delta = -1; 1065 if (T != EXP_CNT) { 1066 ScoreBrackets->setScoreLB( 1067 T, ScoreBrackets->getScoreUB(T) - MaxDelta); 1068 } 1069 EmitSwaitcnt &= ~CNT_MASK(T); 1070 } 1071 CntVal[T] = Delta; 1072 } else { 1073 // If we are not waiting for a particular counter then encode 1074 // it as -1 which means "don't care." 1075 CntVal[T] = -1; 1076 } 1077 } 1078 } 1079 1080 // If we are not waiting on any counter we can skip the wait altogether. 1081 if (EmitSwaitcnt != 0) { 1082 MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); 1083 int Imm = (!OldWaitcnt) ? 
0 : OldWaitcnt->getOperand(0).getImm(); 1084 if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) != 1085 (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || 1086 (AMDGPU::decodeExpcnt(IV, Imm) != 1087 (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || 1088 (AMDGPU::decodeLgkmcnt(IV, Imm) != 1089 (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) { 1090 MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); 1091 if (ContainingLoop) { 1092 MachineBasicBlock *TBB = ContainingLoop->getHeader(); 1093 BlockWaitcntBrackets *ScoreBracket = 1094 BlockWaitcntBracketsMap[TBB].get(); 1095 if (!ScoreBracket) { 1096 assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end()); 1097 BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>(); 1098 ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); 1099 } 1100 ScoreBracket->setRevisitLoop(true); 1101 DEBUG(dbgs() << "set-revisit: block" 1102 << ContainingLoop->getHeader()->getNumber() << '\n';); 1103 } 1104 } 1105 1106 // Update an existing waitcount, or make a new one. 1107 MachineFunction &MF = *MI.getParent()->getParent(); 1108 if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) { 1109 SWaitInst = OldWaitcnt; 1110 } else { 1111 SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT), 1112 MI.getDebugLoc()); 1113 CompilerGeneratedWaitcntSet.insert(SWaitInst); 1114 } 1115 1116 const MachineOperand &Op = 1117 MachineOperand::CreateImm(AMDGPU::encodeWaitcnt( 1118 IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT])); 1119 SWaitInst->addOperand(MF, Op); 1120 1121 if (CntVal[EXP_CNT] == 0) { 1122 ScoreBrackets->setMixedExpTypes(false); 1123 } 1124 } 1125 } 1126 1127 return SWaitInst; 1128 } 1129 1130 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, 1131 MachineInstr *Waitcnt) { 1132 if (MBB.empty()) { 1133 MBB.push_back(Waitcnt); 1134 return; 1135 } 1136 1137 MachineBasicBlock::iterator It = MBB.end(); 1138 MachineInstr *MI = &*(--It); 1139 if (MI->isBranch()) { 1140 MBB.insert(It, Waitcnt); 1141 } else { 1142 MBB.push_back(Waitcnt); 1143 } 1144 1145 return; 1146 } 1147 1148 void SIInsertWaitcnts::updateEventWaitCntAfter( 1149 MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { 1150 // Now look at the instruction opcode. If it is a memory access 1151 // instruction, update the upper-bound of the appropriate counter's 1152 // bracket and the destination operand scores. 1153 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 1154 uint64_t TSFlags = Inst.getDesc().TSFlags; 1155 if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { 1156 if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && 1157 TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { 1158 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); 1159 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); 1160 } else { 1161 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 1162 } 1163 } else if (TII->isFLAT(Inst)) { 1164 assert(Inst.mayLoad() || Inst.mayStore()); 1165 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); 1166 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 1167 1168 // This is a flat memory operation. Check to see if it has memory 1169 // tokens for both LDS and Memory, and if so mark it as a flat. 
1170 bool FoundLDSMem = false; 1171 for (const MachineMemOperand *Memop : Inst.memoperands()) { 1172 unsigned AS = Memop->getAddrSpace(); 1173 if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) 1174 FoundLDSMem = true; 1175 } 1176 1177 // This is a flat memory operation, so note it - it will require 1178 // that both the VM and LGKM be flushed to zero if it is pending when 1179 // a VM or LGKM dependency occurs. 1180 if (FoundLDSMem) { 1181 ScoreBrackets->setPendingFlat(); 1182 } 1183 } else if (SIInstrInfo::isVMEM(Inst) && 1184 // TODO: get a better carve out. 1185 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 && 1186 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC && 1187 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { 1188 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); 1189 if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && 1190 (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { 1191 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); 1192 } 1193 } else if (TII->isSMRD(Inst)) { 1194 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); 1195 } else { 1196 switch (Inst.getOpcode()) { 1197 case AMDGPU::S_SENDMSG: 1198 case AMDGPU::S_SENDMSGHALT: 1199 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); 1200 break; 1201 case AMDGPU::EXP: 1202 case AMDGPU::EXP_DONE: { 1203 int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); 1204 if (Imm >= 32 && Imm <= 63) 1205 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); 1206 else if (Imm >= 12 && Imm <= 15) 1207 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); 1208 else 1209 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); 1210 break; 1211 } 1212 case AMDGPU::S_MEMTIME: 1213 case AMDGPU::S_MEMREALTIME: 1214 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); 1215 break; 1216 default: 1217 break; 1218 } 1219 } 1220 } 1221 1222 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { 1223 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); 1224 int32_t MaxPending[NUM_INST_CNTS] = {0}; 1225 int32_t MaxFlat[NUM_INST_CNTS] = {0}; 1226 bool MixedExpTypes = false; 1227 1228 // Clear the score bracket state. 1229 ScoreBrackets->clear(); 1230 1231 // Compute the number of pending elements on block entry. 1232 1233 // IMPORTANT NOTE: If iterative handling of loops is added, the code will 1234 // need to handle single BBs with backedges to themselves. This means that 1235 // they will need to retain and not clear their initial state. 1236 1237 // See if there are any uninitialized predecessors. If so, emit an 1238 // s_waitcnt 0 at the beginning of the block. 
1239 for (MachineBasicBlock *pred : Block.predecessors()) { 1240 BlockWaitcntBrackets *PredScoreBrackets = 1241 BlockWaitcntBracketsMap[pred].get(); 1242 bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); 1243 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { 1244 break; 1245 } 1246 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1247 T = (enum InstCounterType)(T + 1)) { 1248 int span = 1249 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T); 1250 MaxPending[T] = std::max(MaxPending[T], span); 1251 span = 1252 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); 1253 MaxFlat[T] = std::max(MaxFlat[T], span); 1254 } 1255 1256 MixedExpTypes |= PredScoreBrackets->mixedExpTypes(); 1257 } 1258 1259 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? 1260 // Also handle kills for exit block. 1261 if (Block.succ_empty() && !KillWaitBrackets.empty()) { 1262 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { 1263 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1264 T = (enum InstCounterType)(T + 1)) { 1265 int Span = KillWaitBrackets[I]->getScoreUB(T) - 1266 KillWaitBrackets[I]->getScoreLB(T); 1267 MaxPending[T] = std::max(MaxPending[T], Span); 1268 Span = KillWaitBrackets[I]->pendingFlat(T) - 1269 KillWaitBrackets[I]->getScoreLB(T); 1270 MaxFlat[T] = std::max(MaxFlat[T], Span); 1271 } 1272 1273 MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes(); 1274 } 1275 } 1276 1277 // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. 1278 for (MachineBasicBlock *Pred : Block.predecessors()) { 1279 BlockWaitcntBrackets *PredScoreBrackets = 1280 BlockWaitcntBracketsMap[Pred].get(); 1281 bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); 1282 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { 1283 break; 1284 } 1285 1286 int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - 1287 PredScoreBrackets->getScoreLB(EXP_CNT); 1288 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); 1289 int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) - 1290 PredScoreBrackets->getScoreLB(EXP_CNT); 1291 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); 1292 } 1293 1294 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? 1295 if (Block.succ_empty() && !KillWaitBrackets.empty()) { 1296 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { 1297 int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) - 1298 KillWaitBrackets[I]->getScoreLB(EXP_CNT); 1299 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); 1300 int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) - 1301 KillWaitBrackets[I]->getScoreLB(EXP_CNT); 1302 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); 1303 } 1304 } 1305 1306 #if 0 1307 // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker. 1308 // TODO: how does LC distinguish between function entry and main entry? 1309 // If this is the entry to a function, force a wait. 1310 MachineBasicBlock &Entry = Block.getParent()->front(); 1311 if (Entry.getNumber() == Block.getNumber()) { 1312 ScoreBrackets->setWaitAtBeginning(); 1313 return; 1314 } 1315 #endif 1316 1317 // Now set the current Block's brackets to the largest ending bracket. 
1318 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1319 T = (enum InstCounterType)(T + 1)) { 1320 ScoreBrackets->setScoreUB(T, MaxPending[T]); 1321 ScoreBrackets->setScoreLB(T, 0); 1322 ScoreBrackets->setLastFlat(T, MaxFlat[T]); 1323 } 1324 1325 ScoreBrackets->setMixedExpTypes(MixedExpTypes); 1326 1327 // Set the register scoreboard. 1328 for (MachineBasicBlock *Pred : Block.predecessors()) { 1329 if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { 1330 break; 1331 } 1332 1333 BlockWaitcntBrackets *PredScoreBrackets = 1334 BlockWaitcntBracketsMap[Pred].get(); 1335 1336 // Now merge the gpr_reg_score information 1337 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1338 T = (enum InstCounterType)(T + 1)) { 1339 int PredLB = PredScoreBrackets->getScoreLB(T); 1340 int PredUB = PredScoreBrackets->getScoreUB(T); 1341 if (PredLB < PredUB) { 1342 int PredScale = MaxPending[T] - PredUB; 1343 // Merge vgpr scores. 1344 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) { 1345 int PredRegScore = PredScoreBrackets->getRegScore(J, T); 1346 if (PredRegScore <= PredLB) 1347 continue; 1348 int NewRegScore = PredScale + PredRegScore; 1349 ScoreBrackets->setRegScore( 1350 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); 1351 } 1352 // Also need to merge sgpr scores for lgkm_cnt. 1353 if (T == LGKM_CNT) { 1354 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) { 1355 int PredRegScore = 1356 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); 1357 if (PredRegScore <= PredLB) 1358 continue; 1359 int NewRegScore = PredScale + PredRegScore; 1360 ScoreBrackets->setRegScore( 1361 J + NUM_ALL_VGPRS, LGKM_CNT, 1362 std::max( 1363 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), 1364 NewRegScore)); 1365 } 1366 } 1367 } 1368 } 1369 1370 // Also merge the WaitEvent information. 1371 ForAllWaitEventType(W) { 1372 enum InstCounterType T = PredScoreBrackets->eventCounter(W); 1373 int PredEventUB = PredScoreBrackets->getEventUB(W); 1374 if (PredEventUB > PredScoreBrackets->getScoreLB(T)) { 1375 int NewEventUB = 1376 MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T); 1377 if (NewEventUB > 0) { 1378 ScoreBrackets->setEventUB( 1379 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); 1380 } 1381 } 1382 } 1383 } 1384 1385 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? 1386 // Set the register scoreboard. 1387 if (Block.succ_empty() && !KillWaitBrackets.empty()) { 1388 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { 1389 // Now merge the gpr_reg_score information. 1390 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; 1391 T = (enum InstCounterType)(T + 1)) { 1392 int PredLB = KillWaitBrackets[I]->getScoreLB(T); 1393 int PredUB = KillWaitBrackets[I]->getScoreUB(T); 1394 if (PredLB < PredUB) { 1395 int PredScale = MaxPending[T] - PredUB; 1396 // Merge vgpr scores. 1397 for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) { 1398 int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T); 1399 if (PredRegScore <= PredLB) 1400 continue; 1401 int NewRegScore = PredScale + PredRegScore; 1402 ScoreBrackets->setRegScore( 1403 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); 1404 } 1405 // Also need to merge sgpr scores for lgkm_cnt. 
1406 if (T == LGKM_CNT) { 1407 for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) { 1408 int PredRegScore = 1409 KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); 1410 if (PredRegScore <= PredLB) 1411 continue; 1412 int NewRegScore = PredScale + PredRegScore; 1413 ScoreBrackets->setRegScore( 1414 J + NUM_ALL_VGPRS, LGKM_CNT, 1415 std::max( 1416 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), 1417 NewRegScore)); 1418 } 1419 } 1420 } 1421 } 1422 1423 // Also merge the WaitEvent information. 1424 ForAllWaitEventType(W) { 1425 enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W); 1426 int PredEventUB = KillWaitBrackets[I]->getEventUB(W); 1427 if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) { 1428 int NewEventUB = 1429 MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T); 1430 if (NewEventUB > 0) { 1431 ScoreBrackets->setEventUB( 1432 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); 1433 } 1434 } 1435 } 1436 } 1437 } 1438 1439 // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the 1440 // sequencing predecessors, because changes to EXEC require waitcnts due to 1441 // the delayed nature of these operations. 1442 for (MachineBasicBlock *Pred : Block.predecessors()) { 1443 if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { 1444 break; 1445 } 1446 1447 BlockWaitcntBrackets *PredScoreBrackets = 1448 BlockWaitcntBracketsMap[Pred].get(); 1449 1450 int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK); 1451 if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { 1452 int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub - 1453 PredScoreBrackets->getScoreUB(EXP_CNT); 1454 if (new_gds_ub > 0) { 1455 ScoreBrackets->setEventUB( 1456 GDS_GPR_LOCK, 1457 std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub)); 1458 } 1459 } 1460 int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK); 1461 if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { 1462 int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub - 1463 PredScoreBrackets->getScoreUB(EXP_CNT); 1464 if (new_exp_ub > 0) { 1465 ScoreBrackets->setEventUB( 1466 EXP_GPR_LOCK, 1467 std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub)); 1468 } 1469 } 1470 } 1471 } 1472 1473 /// Return the "bottom" block of a loop. This differs from 1474 /// MachineLoop::getBottomBlock in that it works even if the loop is 1475 /// discontiguous. 1476 MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { 1477 MachineBasicBlock *Bottom = Loop->getHeader(); 1478 for (MachineBasicBlock *MBB : Loop->blocks()) 1479 if (MBB->getNumber() > Bottom->getNumber()) 1480 Bottom = MBB; 1481 return Bottom; 1482 } 1483 1484 // Generate s_waitcnt instructions where needed. 1485 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, 1486 MachineBasicBlock &Block) { 1487 // Initialize the state information. 1488 mergeInputScoreBrackets(Block); 1489 1490 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); 1491 1492 DEBUG({ 1493 dbgs() << "Block" << Block.getNumber(); 1494 ScoreBrackets->dump(); 1495 }); 1496 1497 bool InsertNOP = false; 1498 1499 // Walk over the instructions. 1500 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); 1501 Iter != E;) { 1502 MachineInstr &Inst = *Iter; 1503 // Remove any previously existing waitcnts. 1504 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { 1505 // TODO: Register the old waitcnt and optimize the following waitcnts. 
1506 // Leaving the previously existing waitcnts is conservatively correct. 1507 if (CompilerGeneratedWaitcntSet.find(&Inst) == 1508 CompilerGeneratedWaitcntSet.end()) 1509 ++Iter; 1510 else { 1511 ScoreBrackets->setWaitcnt(&Inst); 1512 ++Iter; 1513 Inst.removeFromParent(); 1514 } 1515 continue; 1516 } 1517 1518 // Kill instructions generate a conditional branch to the endmain block. 1519 // Merge the current waitcnt state into the endmain block information. 1520 // TODO: Are there other flavors of KILL instruction? 1521 if (Inst.getOpcode() == AMDGPU::KILL) { 1522 addKillWaitBracket(ScoreBrackets); 1523 } 1524 1525 bool VCCZBugWorkAround = false; 1526 if (readsVCCZ(Inst) && 1527 (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) { 1528 if (ScoreBrackets->getScoreLB(LGKM_CNT) < 1529 ScoreBrackets->getScoreUB(LGKM_CNT) && 1530 ScoreBrackets->hasPendingSMEM()) { 1531 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) 1532 VCCZBugWorkAround = true; 1533 } 1534 } 1535 1536 // Generate an s_waitcnt instruction to be placed before 1537 // cur_Inst, if needed. 1538 MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets); 1539 1540 if (SWaitInst) { 1541 Block.insert(Inst, SWaitInst); 1542 if (ScoreBrackets->getWaitcnt() != SWaitInst) { 1543 DEBUG(dbgs() << "insertWaitcntInBlock\n" 1544 << "Old Instr: " << Inst << '\n' 1545 << "New Instr: " << *SWaitInst << '\n';); 1546 } 1547 } 1548 1549 updateEventWaitCntAfter(Inst, ScoreBrackets); 1550 1551 #if 0 // TODO: implement resource type check controlled by options with ub = LB. 1552 // If this instruction generates a S_SETVSKIP because it is an 1553 // indexed resource, and we are on Tahiti, then it will also force 1554 // an S_WAITCNT vmcnt(0) 1555 if (RequireCheckResourceType(Inst, context)) { 1556 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. 1557 ScoreBrackets->setScoreLB(VM_CNT, 1558 ScoreBrackets->getScoreUB(VM_CNT)); 1559 } 1560 #endif 1561 1562 ScoreBrackets->clearWaitcnt(); 1563 1564 if (SWaitInst) { 1565 DEBUG({ SWaitInst->print(dbgs() << '\n'); }); 1566 } 1567 DEBUG({ 1568 Inst.print(dbgs()); 1569 ScoreBrackets->dump(); 1570 }); 1571 1572 // Check to see if this is a GWS instruction. If so, and if this is CI or 1573 // VI, then the generated code sequence will include an S_WAITCNT 0. 1574 // TODO: Are these the only GWS instructions? 1575 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT || 1576 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V || 1577 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || 1578 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || 1579 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { 1580 // TODO: && context->target_info->GwsRequiresMemViolTest() ) { 1581 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); 1582 ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); 1583 ScoreBrackets->updateByWait(LGKM_CNT, 1584 ScoreBrackets->getScoreUB(LGKM_CNT)); 1585 } 1586 1587 // TODO: Remove this work-around after fixing the scheduler and enable the 1588 // assert above. 1589 if (VCCZBugWorkAround) { 1590 // Restore the vccz bit. Any time a value is written to vcc, the vcc 1591 // bit is updated, so we can restore the bit by reading the value of 1592 // vcc and then writing it back to the register. 
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {

      // This avoids an s_nop after a waitcnt has just been inserted.
      if (!SWaitInst && InsertNOP) {
        BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
      }
      InsertNOP = false;

      // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
      // or SMEM clause, respectively.
      //
      // The temporary workaround is to break the clauses with S_NOP.
      //
      // The proper solution would be to allocate registers such that all
      // source and destination registers don't overlap, e.g. this is illegal:
      //   r0 = load r2
      //   r2 = load r0
      bool IsSMEM = false;
      bool IsVMEM = false;
      if (TII->isSMRD(Inst))
        IsSMEM = true;
      else if (TII->usesVM_CNT(Inst))
        IsVMEM = true;

      ++Iter;
      if (Iter == E)
        break;

      MachineInstr &Next = *Iter;

      // TODO: How about consecutive SMEM instructions?
      // The comment above says to break the clause, but the code does not.
      // if ((TII->isSMRD(next) && isSMEM) ||
      if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
          // TODO: Enable this check when hasSoftClause is upstreamed.
          // ST->hasSoftClauses() &&
          ST->isXNACKEnabled()) {
        // Insert a NOP to break the clause.
        InsertNOP = true;
        continue;
      }

      // There must be an "S_NOP 0" between an instruction writing M0 and
      // S_SENDMSG.
      if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
           Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
          Inst.definesRegister(AMDGPU::M0))
        InsertNOP = true;

      continue;
    }

    ++Iter;
  }

  // Check if we need to force convergence at the loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement and doesn't always guarantee convergence for a loop. Each
    // loop should take at most two iterations to converge naturally. When
    // this maximum is reached and the result still doesn't converge, we force
    // convergence by inserting an s_waitcnt at the end of the loop footer.
    if (WaitcntData->getIterCnt() > 2) {
      // To ensure convergence, the wait events at the loop footer must be no
      // more than those from the previous iteration. As a simplification,
      // instead of tracking individual scores and generating the precise
      // wait count, just wait on 0.
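      // In effect the forced wait is an "s_waitcnt 0", i.e. roughly
      //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
      // (illustrative; the actual instruction is the S_WAITCNT with a zero
      // immediate created below).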
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = Block.getParent()->CreateMachineInstr(
              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
          CompilerGeneratedWaitcntSet.insert(SWaitInst);
          const MachineOperand &Op = MachineOperand::CreateImm(0);
          SWaitInst->addOperand(MF, Op);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or was
        // created in a previous iteration and is being added back, since
        // block traversal always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  AMDGPUASI = ST->getAMDGPUAS();

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  // Walk over the blocks in reverse post order, inserting s_waitcnt where
  // needed.
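  // Reverse post order visits each block before any of its successors except
  // along back edges, so by the time a block is processed its non-loop
  // predecessors have already been scored and mergeInputScoreBrackets can
  // combine their brackets; loop back edges are handled by the revisit logic
  // below.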
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least one re-walk over the loop to propagate the information, even
    // if no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
        (BlockWaitcntProcessedSet.find(&MBB) ==
         BlockWaitcntProcessedSet.end())) {
      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
      DEBUG(dbgs() << "set-revisit: block"
                   << ContainingLoop->getHeader()->getNumber() << '\n';);
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Flag that waitcnts have been processed at least once.
    BlockWaitcntProcessedSet.insert(&MBB);

    // See if we want to revisit the loop.
    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // The loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {

    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {

      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could
    // be improved by looking across blocks for flushes in postdominating
    // blocks from the stores, but an explicitly requested flush is probably
    // very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {

        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt, if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after
    // the costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}
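// Illustrative (not exact) shape of the code this pass leaves at a wave
// termination point when scalar stores were seen earlier in the function:
//   ...
//   s_dcache_wb    ; flush scalar writes before the wave terminates
//   s_endpgm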