//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading a constant into
// a VGPR at the beginning of the shader even though it won't be needed until
// the end of the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from
// different wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage low.
//
// First the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high
//   latencies later. To help control register usage, we typically want all
//   local computations, when for example you create a result that can be
//   consumed right away, to be contained in a block. Block inputs and
//   outputs would typically be important results that are needed in several
//   locations of the shader. Since we do want blocks to help hide high
//   latencies, we want the instructions inside the block to have a minimal
//   set of dependencies on high latencies. It will make it easy to pick
//   blocks to hide specific high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
//   At this step we only take into account register usage and the hiding of
//   low latency instructions.
//
// Third the block order is chosen. There we try to hide high latencies and
// keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to reach the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specifically take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.
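//
// As a back-of-the-envelope check of the 400-cycle figure above (an
// editor's estimate, not from the docs): one wavefront issues an
// instruction every 4 cycles, so with 10 active wavefronts a given
// wavefront gets an issue slot roughly every 10 * 4 = 40 cycles; 10
// independent instructions from that wavefront thus span about
// 10 * 40 = 400 cycles, which matches the quoted global memory latency.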

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:      return "NOCAND";
  case RegUsage:    return "REGUSAGE";
  case Latency:     return "LATENCY";
  case Successor:   return "SUCCESSOR";
  case Depth:       return "DEPTH";
  case NodeOrder:   return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If there are a lot of constant loads in the block, the SGPR usage
  // could go quite high; the arbitrary limit of 60 above thus encourages
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
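  // (NodeOrder is the weakest reason: it only applies when no other
  // criterion separated the two candidates. The caller treats any reason
  // other than NoCand as "TryCand wins".)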
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if Reg was defined between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
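  // (Only virtual registers enter LiveInRegs/LiveOutRegs; the
  // isVirtualRegister() guards below filter out physical registers,
  // presumably things like VCC or EXEC on SI, whose liveness is not useful
  // for the block-level pressure estimate.)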
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of
  //    one.
  // 2) 1) + read in the block and not needed after it.
  // 3) 1) + read in the block but needed in another block.
  // 4) Reg is input of an instruction but another block will read it too.
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    The result is not read in the block (which implies it is used in
  //    another block).
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    The result is read in the block and not needed in another block.
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    The result is read in the block but also needed in another block.
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only the regs whose content will be
  // read afterwards in another block, and whose content was written in the
  // current block, that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was able
  // to correctly handle 5 vs 6, and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4,
  // 5 and 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally LLVM will represent some registers as big 128-bit registers,
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock == true) or outside it (InOrOutBlock == false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs.
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait
  // either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the
  // block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (SUnit* SU : SUnits) {
    SU->dump(DAG);
  }

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
  DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
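    // Give each high latency instruction its own reserved color, i.e. its
    // own block.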
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        else if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        } else {
          // Check the type of dependency.
          for (unsigned k : SubGraph) {
            // If in the path to join the two instructions,
            // there is another high latency instruction,
            // or instructions colored for another block,
            // abort the merge.
            if (DAG->IsHighLatencySU[k] ||
                (CurrentColoring[k] != ProposedColor &&
                 CurrentColoring[k] != 0)) {
              CompatibleGroup = false;
              break;
            }
            // If one of the SUs in the subgraph depends on the result of
            // SU j, there'll be a data dependency.
            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
              CompatibleGroup = false;
              break;
            }
          }
          if (!CompatibleGroup)
            break;
          // Same check for the SU.
          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
          // Add all the required instructions to the block.
          // These cannot live in another block (because they
          // depend (order dependency) on one of the
          // instructions in the block, and are required for the
          // high latency instruction we add.
          AdditionalElements.insert(AdditionalElements.end(),
                                    SubGraph.begin(), SubGraph.end());
        }
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or the group is big enough.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as the parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as the children.
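    // (Colors <= DAGSize are reserved, i.e. high latency, colors: a
    // reserved color is not propagated directly; a new combination color
    // is created below instead.)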
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down and bottom up
  // reserved node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

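  // Merge colors that contain a single instruction into their unique
  // successor color: a singleton group is not worth a block of its own.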
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, that the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory uses the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // i.e. without needing any other instruction.
      // The EXP is allowed to depend on other EXPs
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Instructions other than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

// Function taken from CodeGen/MachineScheduler.cpp.

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // First produce a valid (but possibly suboptimal) schedule, such that
  // each Block corresponds to a contiguous range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an
  // output is not the sum of child blocks having as input the same virtual
  // register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
    dbgs() << '\n';
  );
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
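  // (A smaller LastPosHighLatParentScheduled means the high latency parents
  // were scheduled further in the past, so more of their latency is already
  // hidden by the blocks scheduled in between.)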
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
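    // (The 120 VGPR threshold below is an arbitrary heuristic: above it,
    // candidates are ranked by register usage first and latency second;
    // below it, latency first. The scheduler variant can force either
    // order.)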
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
           << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // Only virtual registers are tracked; see addLiveRegs().
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Succ : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Succ.first->getID()] == 0)
      ReadyBlocks.push_back(Succ.first);

    if (Parent->isHighLatencyBlock() &&
        Succ.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Succ.first->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::pair<unsigned, unsigned> RegP :
       LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, so it must not already be alive.
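    // (decreaseLiveRegs() decrements LiveRegsConsumers down to zero but
    // never erases the map entry, hence the '== 0' alternative accepted by
    // the assert below.)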
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit *SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from ScheduleDAG.cpp.
// Does a topological sort over the SUs.
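// (The two index vectors filled below map a position in the topological
// order to the NodeNum of the corresponding SU; the block creation and
// block scheduling passes use them to walk the SUs in either direction.)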
// Both TopDown and BottomUp orders are produced.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low-latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take into account
// SGPR usage (based on VGPR usage and the corresponding wavefront count),
// and that would try to merge groups of loads when it makes sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep &PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low-latency instructions
    // depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep &SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
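// (A virtual register can belong to several pressure sets; only the weight
// it contributes to the VGPR and SGPR sets is accumulated here. This is
// called from SIScheduleBlockScheduler::pickBlock() with the set of
// currently live registers.)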
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for (SUnit &SU : SUnits)
      SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
      // { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
      // { LatenciesGrouped, BlockRegUsageLatency },
      // { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      // { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (const auto &v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but that could lead to lower VGPR usage.
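  // (The rows commented out below are exactly the combinations already
  // tried in the table above. The 200 threshold is again a heuristic,
  // relative to the 256 VGPRs addressable per wavefront.)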
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      // { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
      // { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
      // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (const auto &v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned SUNum : ScheduledSUnits) {
    SUnit *SU = &SUnits[SUNum];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}
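
// Usage note: SIScheduleDAGMI is not created directly; it is wired into the
// machine scheduler through a MachineSchedRegistry entry. A minimal sketch
// of that hookup follows (the real registration lives in
// AMDGPUTargetMachine.cpp; the registry name shown here is illustrative):
//
//   static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
//     return new SIScheduleDAGMI(C);
//   }
//
//   static MachineSchedRegistry
//   SISchedRegistry("si-scheduler", "Run SI's custom scheduler",
//                   createSIMachineScheduler);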