//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "misched"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Other bad behaviours show up as well, like loading a constant into a VGPR
// at the beginning of the shader when it won't be needed until the end of
// the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from
// different wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks also helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high
// latencies later. To help control register usage, we typically want all
// local computations, for example when you create a result that can be
// consumed right away, to be contained in a block. Block inputs and outputs
// would typically be important results that are needed in several locations
// of the shader. Since we do want blocks to help hide high latencies, we
// want the instructions inside the block to have a minimal set of
// dependencies on high latencies. This makes it easy to pick blocks to hide
// specific high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At this step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Note that 'low latency' and 'high latency' here cover both the latency
// for the cache (or global memory) data to reach the register, and the
// bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter. That is why, even if the wavefront count is
// high, we still have to try to have as many instructions as possible
// hiding high latencies.
// The OpenCL doc gives, for example, a latency of 400 cycles for a global
// memory access, which is hidden by 10 instructions if the wavefront count
// is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 byte
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2, and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specifically take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.
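
// For illustration, a rough sketch of the arithmetic behind these figures
// (an estimate, not an exact model of the HW): a wavefront issues an
// instruction every 4 cycles, so with W wavefronts in flight, N independent
// instructions placed between a load and its first use hide about
//   4 (cycles/instruction) * W (wavefronts) * N (instructions) cycles.
// Plugging in the 400 cycle global memory latency quoted above:
//   400 = 4 * 10 * 10
// i.e. 10 instructions hide it at a wavefront count of 10, matching the
// OpenCL doc example.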

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
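
// A sketch of how these helpers chain into a comparator (this is the
// pattern used by tryCandidateTopDown and the block comparators further
// down, not additional logic): each criterion either decides the comparison
// or ties and falls through to the next, weaker criterion.
//
//   if (tryLess(TryCand.X, Cand.X, TryCand, Cand, SomeReason))
//     return; // X decided it: TryCand wins (its Reason was set) or loses.
//   // X tied, compare the next criterion...
//
// (X and SomeReason are placeholders, not fields of the real candidates.)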

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as early as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions) - instructions that
  // depend on the first low latency instructions.
  // If the block contains a lot of constant loads, SGPR usage can get quite
  // high; above the arbitrary limit of 60 checked earlier, we thus
  // encourage using the already loaded constants (in order to release some
  // SGPRs) before loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether Reg is defined between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. The RPTracker captures what had to be alive for
  // the SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of
  //    one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    The result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    The result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    The result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6 and 7.
  // We want LiveOutRegs to contain only Regs whose content will be read
  // after in another block, and whose content was written in the current
  // block, that is we want it to get 1, 3, 5 and 7.
  // Since we packed all the MIs of a block together before scheduling, the
  // LiveIntervals were correct, and the RPTracker was able to correctly
  // handle 5 vs 6 and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5
  // and 7.
  // Comparing against LiveInRegs is not sufficient to differentiate 4 from
  // 5 and 7. The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bit registers
  // for example, but they actually correspond to 4 actual 32 bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block (if InOrOutBlock
/// is true) or outside the block (if InOrOutBlock is false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait
  // either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the
  // block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (SUnit* SU : SUnits) {
    SU->dump(DAG);
  }

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
  DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}
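
// A note on the coloring convention used by the functions below (an
// illustration derived from createBlocksForVariant, not additional logic):
// reserved colors are allocated from NextReservedID starting at 1 and are
// used for high latency groups; ordinary colors are allocated from
// NextNonReservedID starting at DAGSize + 1. Hence the recurring test
//   CurrentColoring[SU->NodeNum] <= (int)DAGSize
// which skips SUs already attached to a reserved (high latency) group.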

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}
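
// For illustration, the grouping performed by colorHighLatenciesGroups
// below, given its GroupSize selection (2 for up to 6 high latency SUs, 3
// for up to 12, 4 beyond), on a hypothetical DAG with 8 independent high
// latency SUs H0..H7 (so GroupSize = 3):
//   {H0, H1, H2} -> one reserved color
//   {H3, H4, H5} -> another reserved color
//   {H6, H7}     -> a third one
// A data dependency between two of them (say H4 reading H3's result)
// instead closes the current group and starts a new one at H4.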

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        else if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        else {
          // Check the type of dependency.
          for (unsigned k : SubGraph) {
            // If, in the path joining the two instructions,
            // there is another high latency instruction,
            // or instructions colored for another block,
            // abort the merge.
            if (DAG->IsHighLatencySU[k] ||
                (CurrentColoring[k] != ProposedColor &&
                 CurrentColoring[k] != 0)) {
              CompatibleGroup = false;
              break;
            }
            // If one of the SUs in the subgraph depends on the result of
            // SU j, there'll be a data dependency.
            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
              CompatibleGroup = false;
              break;
            }
          }
          if (!CompatibleGroup)
            break;
          // Same check for the SU.
          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
          // Add all the required instructions to the block. These cannot
          // live in another block (because they depend (order dependency)
          // on one of the instructions in the block, and are required for
          // the high latency instruction we add).
          AdditionalElements.insert(AdditionalElements.end(),
                                    SubGraph.begin(), SubGraph.end());
        }
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}
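
// A worked instance of the pair combination above (hypothetical color
// numbers): if SU4 has top-down color 7 and bottom-up color 9, every other
// SU with the exact pair (7, 9) receives the same fresh non reserved color,
// while an SU with (7, 10) receives a different one. SUs therefore end up
// grouped by exactly which high latency results they depend on and feed.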

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free the roots and leaves of all blocks to enable scheduling inside
  // them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

// Function taken from Codegen/MachineScheduler.cpp.

/// Advance I past any debug values, without going beyond End.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
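
// A trace of the worklist sort above on a hypothetical three-block graph
// B0 -> B1 -> B2 (arrows point from predecessor to successor):
//   init: successor counts B0:1 B1:1 B2:0       => WorkList = {B2}
//   pop B2: gets index 2; B1's count drops to 0 => WorkList = {B1}
//   pop B1: gets index 1; B0's count drops to 0 => WorkList = {B0}
//   pop B0: gets index 0
// yielding TopDownIndex2Block = {B0, B1, B2}, and BottomUpIndex2Block as
// its reverse.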

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first compute a valid schedule such that every Block corresponds
  // to a contiguous range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring the previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most CPU intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + 1)
          Depth = Pred->Depth + 1;
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        // Mirror the Depth computation above: with Height starting at 0,
        // std::min would always yield 0, so take the max over successors.
        Height = std::max(Height, Succ.first->Height + 1);
      Block->Height = Height;
    }
  }
}
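
// For illustration, on a hypothetical three-block chain B0 -> B1 -> B2 the
// two passes above compute:
//   Depth  (longest path from a root): B0 = 0, B1 = 1, B2 = 2
//   Height (longest path to a leaf):   B0 = 2, B1 = 1, B2 = 0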

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an
  // output is not the sum of child blocks having as input the same virtual
  // register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
    dbgs() << '\n';
  );
}
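
// A hypothetical pick under tryCandidateLatency below: with two ready
// blocks
//   B3: IsHighLatency = true,  LastPosHighLatParentScheduled = 0
//   B5: IsHighLatency = false, LastPosHighLatParentScheduled = 2
// B3 wins on the very first criterion (0 < 2: its high latency parents, if
// any, were scheduled further back, leaving more time for the latency to be
// hidden), before IsHighLatency is even compared.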

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
           << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] =
        NumBlockScheduled;
  }
}
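
// A sketch of the live register accounting done in blockScheduled below,
// for a hypothetical register %x produced by block A and consumed by
// blocks B and C (so LiveOutRegsNumUsages[A->getID()][%x] == 2):
//   schedule A: addLiveRegs inserts %x, then LiveRegsConsumers[%x] = 2
//   schedule B: decreaseLiveRegs lowers it to 1, %x stays live
//   schedule C: decreaseLiveRegs lowers it to 0, %x leaves LiveRegs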
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;
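// How this DAG gets instantiated is up to the target: a sketch of the
// usual hookup (illustrative only; the actual registration lives in the
// target's TargetMachine setup code, and the names below are assumptions):
//
//   static ScheduleDAGInstrs *
//   createSIMachineScheduler(MachineSchedContext *C) {
//     return new SIScheduleDAGMI(C);
//   }
//
//   static MachineSchedRegistry
//   SISchedRegistry("si-scheduler", "Run the SI scheduler",
//                   createSIMachineScheduler);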
// Code adapted from ScheduleDAG.cpp.
// Does a topological sort over the SUs.
// Provides both the TopDown and BottomUp orderings.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low-latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), and that would try to merge groups of loads when it makes
// sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low-latency
    // instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
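// For example, pickBlock() above queries it as:
//
//   unsigned VGPR, SGPR;
//   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), VGPR, SGPR);
//
// Each virtual register can belong to several pressure sets; only the
// weights of the VGPR and SGPR sets are accumulated here.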
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for (SUnit &SU : SUnits)
      SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//    { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//    { LatenciesGrouped, BlockRegUsageLatency },
//    { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//    { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants,
  // which perform worse, but could lead to lower VGPR usage.
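  // (Presumably the 180 and 200 thresholds are heuristics rather than
  // documented hardware constants: a GCN lane can address at most 256
  // VGPRs, so usage above 200 leaves little headroom for the register
  // allocator and spilling becomes likely.)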
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//    { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//    { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
    unsigned BBNum = begin()->getParent()->getNumber();
    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}