//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions ------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

#define DEBUG_TYPE "si-insert-waitcnts"

using namespace llvm;

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether
// an s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

typedef std::pair<signed, signed> RegInterval;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
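// Illustrative example of the mapping above (the register numbers are made
// up, only the arithmetic follows from the enum):
//   v7  -> score-table slot 7
//   LDS -> score-table slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS (256)
//   s5  -> score-table slot NUM_ALL_VGPRS + 5 (262)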
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains current score brackets
// of each wait-counter, and a per-register scoreboard for each wait-counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait-count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  };

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  };

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  };

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  };

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  };

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }
  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }
  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  BlockWaitcntBrackets()
      : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false),
        MixedExpTypes(false), LoopRegion(NULL), PostOrder(0), Waitcnt(NULL),
        VgprUB(0), SgprUB(0) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }
  ~BlockWaitcntBrackets(){};

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = NULL; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  bool WaitAtBeginning;
  bool RevisitLoop;
  bool ValidLoop;
  bool MixedExpTypes;
  MachineLoop *LoopRegion;
  int32_t PostOrder;
  MachineInstr *Waitcnt;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB;
  int32_t SgprUB;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records waitcnt status at the end of
// loop footer from the previous iteration. We also maintain an iteration
// count to track the number of times the loop has been visited. When it
// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
// at the end of the loop footer.
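// A minimal sketch of the situation described above (the shader code is
// hypothetical): a DS access inside the loop keeps raising the lgkm_cnt score
// on every visit, so the brackets never stabilize; once the iteration limit is
// reached, an "s_waitcnt 0" is placed in the loop footer, before its branch:
//
//   loop_header:
//     ds_read_b32 v0, v1
//     ...
//   loop_footer:
//     s_waitcnt 0               ; forces the scores to converge
//     s_cbranch_scc1 loop_header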
class LoopWaitcntData {
public:
  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  int32_t getIterCnt() { return IterCnt; }

  LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {}
  ~LoopWaitcntData(){};

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() {
    DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
    return;
  }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt;
  // Number of iterations the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt;
};

class SIInsertWaitcnts : public MachineFunctionPass {

private:
  const SISubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  const MachineLoopInfo *MLI;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

public:
  static char ID;

  SIInsertWaitcnts()
      : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
        MRI(nullptr), MLI(nullptr) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
                                           BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitCntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // End anonymous namespace.

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
      Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
      Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
  return;
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if (T == VM_CNT && hasPendingFlat()) {
      // If there is a pending FLAT operation, and this is a VM waitcnt,
      // then we need to force a waitcnt 0 for VM.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are
      // multiple types of event in the bracket. Also emit an s_waitcnt
      // with a conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitSwaitcnt = 0;
  // s_waitcnt instruction to return; default is NULL.
  MachineInstr *SWaitInst = nullptr;
  // No need to wait before phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.
  if (MI.isDebugValue() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return SWaitInst;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitSwaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitSwaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //       with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::RETURN ||
      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitSwaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitSwaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                                ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                  ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                                ScoreBrackets->getScoreUB(EXP_CNT));
            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                                ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                                ScoreBrackets->getScoreUB(EXP_CNT));
            EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitSwaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUASI.LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitSwaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitSwaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    //    instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    //    instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUASI.LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitSwaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitSwaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitSwaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // TODO: Tie force zero to a compiler triage option.
  bool ForceZero = false;

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitSwaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitSwaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitSwaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM. vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      ForceZero = true;
      EmitSwaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitSwaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceZero) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitSwaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitSwaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitSwaitcnt != 0) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
            BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          DEBUG(dbgs() << "set-revisit: block"
                       << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcount, or make a new one.
      MachineFunction &MF = *MI.getParent()->getParent();
      if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
        SWaitInst = OldWaitcnt;
      } else {
        SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
                                          MI.getDebugLoc());
        CompilerGeneratedWaitcntSet.insert(SWaitInst);
      }

      const MachineOperand &Op =
          MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
              IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
      SWaitInst->addOperand(MF, Op);

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }

  return SWaitInst;
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }

  return;
}

// This is a flat memory operation. Check to see if it has memory
// tokens for both LDS and Memory, and if so mark it as a flat.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitCntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
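  // Summary of the classification done below: DS accesses raise lgkm_cnt
  // (GDS forms also raise exp_cnt via GDS_GPR_LOCK), FLAT accesses may raise
  // both vm_cnt and lgkm_cnt, other VMEM raises vm_cnt (stores/atomics also
  // raise exp_cnt via VMW_GPR_LOCK), SMRD raises lgkm_cnt, and messages,
  // exports, and S_MEMTIME/S_MEMREALTIME are handled as special cases.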
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}

void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // Clear the score bracket state.
  ScoreBrackets->clear();

  // Compute the number of pending elements on block entry.

  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
  // need to handle single BBs with backedges to themselves. This means that
  // they will need to retain and not clear their initial state.

  // See if there are any uninitialized predecessors. If so, emit an
  // s_waitcnt 0 at the beginning of the block.
  for (MachineBasicBlock *pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[pred].get();
    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      break;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Also handle kills for exit block.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int Span = KillWaitBrackets[I]->getScoreUB(T) -
                   KillWaitBrackets[I]->getScoreLB(T);
        MaxPending[T] = std::max(MaxPending[T], Span);
        Span = KillWaitBrackets[I]->pendingFlat(T) -
               KillWaitBrackets[I]->getScoreLB(T);
        MaxFlat[T] = std::max(MaxFlat[T], Span);
      }

      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    }
  }

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      break;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    }
  }

#if 0
  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
      break;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Set the register scoreboard.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      // Now merge the gpr_reg_score information.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
        if (PredLB < PredUB) {
          int PredScale = MaxPending[T] - PredUB;
          // Merge vgpr scores.
          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
          }
          // Also need to merge sgpr scores for lgkm_cnt.
          if (T == LGKM_CNT) {
            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
              int PredRegScore =
                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
              if (PredRegScore <= PredLB)
                continue;
              int NewRegScore = PredScale + PredRegScore;
              ScoreBrackets->setRegScore(
                  J + NUM_ALL_VGPRS, LGKM_CNT,
                  std::max(
                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                      NewRegScore));
            }
          }
        }
      }

      // Also merge the WaitEvent information.
      ForAllWaitEventType(W) {
        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
          int NewEventUB =
              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
          if (NewEventUB > 0) {
            ScoreBrackets->setEventUB(
                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
          }
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
      break;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_gds_ub > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
      }
    }
    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_exp_ub > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
      }
    }
  }
}

/// Return the "bottom" block of a loop. This differs from
/// MachineLoop::getBottomBlock in that it works even if the loop is
/// discontiguous.
MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
  MachineBasicBlock *Bottom = Loop->getHeader();
  for (MachineBasicBlock *MBB : Loop->blocks())
    if (MBB->getNumber() > Bottom->getNumber())
      Bottom = MBB;
  return Bottom;
}

// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  DEBUG({
    dbgs() << "Block" << Block.getNumber();
    ScoreBrackets->dump();
  });

  bool InsertNOP = false;

  // Walk over the instructions.
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      // TODO: Register the old waitcnt and optimize the following waitcnts.
      // Leaving the previously existing waitcnts is conservatively correct.
      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
          CompilerGeneratedWaitcntSet.end())
        ++Iter;
      else {
        ScoreBrackets->setWaitcnt(&Inst);
        ++Iter;
        Inst.removeFromParent();
      }
      continue;
    }

    // Kill instructions generate a conditional branch to the endmain block.
    // Merge the current waitcnt state into the endmain block information.
    // TODO: Are there other flavors of KILL instruction?
    if (Inst.getOpcode() == AMDGPU::KILL) {
      addKillWaitBracket(ScoreBrackets);
    }

    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before
    // cur_Inst, if needed.
    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);

    if (SWaitInst) {
      Block.insert(Inst, SWaitInst);
      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
        DEBUG(dbgs() << "insertWaitcntInBlock\n"
                     << "Old Instr: " << Inst << '\n'
                     << "New Instr: " << *SWaitInst << '\n';);
      }
    }

    updateEventWaitCntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    ScoreBrackets->clearWaitcnt();

    if (SWaitInst) {
      DEBUG({ SWaitInst->print(dbgs() << '\n'); });
    }
    DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    //       assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {

      // This avoids an s_nop after a waitcnt has just been inserted.
      if (!SWaitInst && InsertNOP) {
        BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
      }
      InsertNOP = false;

      // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
      // or SMEM clause, respectively.
      //
      // The temporary workaround is to break the clauses with S_NOP.
      //
      // The proper solution would be to allocate registers such that all
      // source and destination registers don't overlap, e.g. this is illegal:
      //   r0 = load r2
      //   r2 = load r0
      bool IsSMEM = false;
      bool IsVMEM = false;
      if (TII->isSMRD(Inst))
        IsSMEM = true;
      else if (TII->usesVM_CNT(Inst))
        IsVMEM = true;

      ++Iter;
      if (Iter == E)
        break;

      MachineInstr &Next = *Iter;

      // TODO: How about consecutive SMEM instructions?
      // The comment above says to break the clause, but the code does not.
      // if ((TII->isSMRD(next) && isSMEM) ||
      if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
          // TODO: Enable this check when hasSoftClause is upstreamed.
          // ST->hasSoftClauses() &&
          ST->isXNACKEnabled()) {
        // Insert a NOP to break the clause.
        InsertNOP = true;
        continue;
      }

      // There must be "S_NOP 0" between an instruction writing M0 and
      // S_SENDMSG.
      if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
           Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
          Inst.definesRegister(AMDGPU::M0))
        InsertNOP = true;

      continue;
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement and doesn't always guarantee convergence for a loop. Each
    // loop should take at most 2 iterations to converge naturally. When this
    // maximum is reached and the result still hasn't converged, we force
    // convergence by inserting an s_waitcnt at the end of the loop footer.
    if (WaitcntData->getIterCnt() > 2) {
      // To ensure convergence, the wait events at the loop footer must be no
      // more than those from the previous iteration. As a simplification,
      // instead of tracking individual scores and generating the precise
      // wait count, just wait on 0.
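      // An s_waitcnt with an immediate of 0 waits for vmcnt, expcnt and
      // lgkmcnt to all drain to zero, which conservatively covers whatever
      // the precise per-counter waits would have been.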
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = Block.getParent()->CreateMachineInstr(
              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
          CompilerGeneratedWaitcntSet.insert(SWaitInst);
          const MachineOperand &Op = MachineOperand::CreateImm(0);
          SWaitInst->addOperand(MF, Op);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or was
        // created in a previous iteration and is added back, since block
        // traversal always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  AMDGPUASI = ST->getAMDGPUAS();

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
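  // Reverse post order visits a block only after all of its predecessors,
  // except along loop back edges, so mergeInputScoreBrackets sees finalized
  // score brackets for the already-visited predecessors; back edges are
  // handled by the loop revisit logic below.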
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
        (BlockWaitcntProcessedSet.find(&MBB) ==
         BlockWaitcntProcessedSet.end())) {
      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
      DEBUG(dbgs() << "set-revisit: block"
                   << ContainingLoop->getHeader()->getNumber() << '\n';);
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Flag that waitcnts have been processed at least once.
    BlockWaitcntProcessedSet.insert(&MBB);

    // See if we want to revisit the loop.
    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {

    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {

      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {

        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}
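
// Illustrative sketch of the transformation this pass performs (the exact
// counter values depend on the scoreboard state at the point of use):
//
//   buffer_load_dword v0, ...      ; asynchronous VMEM read, vmcnt pending
//   ...
//   s_waitcnt vmcnt(0)             ; inserted before the first use of v0
//   v_add_f32 v1, v0, v0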