1 //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// SI Machine Scheduler interface 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "SIMachineScheduler.h" 16 #include "AMDGPU.h" 17 #include "SIInstrInfo.h" 18 #include "SIRegisterInfo.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/CodeGen/LiveInterval.h" 23 #include "llvm/CodeGen/LiveIntervals.h" 24 #include "llvm/CodeGen/MachineInstr.h" 25 #include "llvm/CodeGen/MachineRegisterInfo.h" 26 #include "llvm/CodeGen/MachineScheduler.h" 27 #include "llvm/CodeGen/RegisterPressure.h" 28 #include "llvm/CodeGen/SlotIndexes.h" 29 #include "llvm/CodeGen/TargetRegisterInfo.h" 30 #include "llvm/Support/Debug.h" 31 #include "llvm/Support/ErrorHandling.h" 32 #include "llvm/Support/raw_ostream.h" 33 #include <algorithm> 34 #include <cassert> 35 #include <map> 36 #include <set> 37 #include <utility> 38 #include <vector> 39 40 using namespace llvm; 41 42 #define DEBUG_TYPE "machine-scheduler" 43 44 // This scheduler implements a different scheduling algorithm than 45 // GenericScheduler. 46 // 47 // There are several specific architecture behaviours that can't be modelled 48 // for GenericScheduler: 49 // . When accessing the result of an SGPR load instruction, you have to wait 50 // for all the SGPR load instructions before your current instruction to 51 // have finished. 52 // . When accessing the result of an VGPR load instruction, you have to wait 53 // for all the VGPR load instructions previous to the VGPR load instruction 54 // you are interested in to finish. 55 // . 
The lower the register pressure, the better load latencies are hidden
//
// Moreover some specificities (like the fact a lot of instructions in the shader
// have few dependencies) make the generic scheduler have some unpredictable
// behaviours. For example when register pressure becomes high, it can either
// manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. 
Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we do take into account only register usage and hiding
// low latency instructions
//
// Third the block order is chosen, there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data to go to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 byte
// lines.
// Constant cache is shared with 4 CUs. 
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
// cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon. 
139 140 // common code // 141 142 #ifndef NDEBUG 143 144 static const char *getReasonStr(SIScheduleCandReason Reason) { 145 switch (Reason) { 146 case NoCand: return "NOCAND"; 147 case RegUsage: return "REGUSAGE"; 148 case Latency: return "LATENCY"; 149 case Successor: return "SUCCESSOR"; 150 case Depth: return "DEPTH"; 151 case NodeOrder: return "ORDER"; 152 } 153 llvm_unreachable("Unknown reason!"); 154 } 155 156 #endif 157 158 namespace llvm { 159 namespace SISched { 160 static bool tryLess(int TryVal, int CandVal, 161 SISchedulerCandidate &TryCand, 162 SISchedulerCandidate &Cand, 163 SIScheduleCandReason Reason) { 164 if (TryVal < CandVal) { 165 TryCand.Reason = Reason; 166 return true; 167 } 168 if (TryVal > CandVal) { 169 if (Cand.Reason > Reason) 170 Cand.Reason = Reason; 171 return true; 172 } 173 Cand.setRepeat(Reason); 174 return false; 175 } 176 177 static bool tryGreater(int TryVal, int CandVal, 178 SISchedulerCandidate &TryCand, 179 SISchedulerCandidate &Cand, 180 SIScheduleCandReason Reason) { 181 if (TryVal > CandVal) { 182 TryCand.Reason = Reason; 183 return true; 184 } 185 if (TryVal < CandVal) { 186 if (Cand.Reason > Reason) 187 Cand.Reason = Reason; 188 return true; 189 } 190 Cand.setRepeat(Reason); 191 return false; 192 } 193 } // end namespace SISched 194 } // end namespace llvm 195 196 // SIScheduleBlock // 197 198 void SIScheduleBlock::addUnit(SUnit *SU) { 199 NodeNum2Index[SU->NodeNum] = SUnits.size(); 200 SUnits.push_back(SU); 201 } 202 203 #ifndef NDEBUG 204 void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { 205 206 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 207 dbgs() << '\n'; 208 } 209 #endif 210 211 void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, 212 SISchedCandidate &TryCand) { 213 // Initialize the candidate if needed. 
214 if (!Cand.isValid()) { 215 TryCand.Reason = NodeOrder; 216 return; 217 } 218 219 if (Cand.SGPRUsage > 60 && 220 SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, 221 TryCand, Cand, RegUsage)) 222 return; 223 224 // Schedule low latency instructions as top as possible. 225 // Order of priority is: 226 // . Low latency instructions which do not depend on other low latency 227 // instructions we haven't waited for 228 // . Other instructions which do not depend on low latency instructions 229 // we haven't waited for 230 // . Low latencies 231 // . All other instructions 232 // Goal is to get: low latency instructions - independent instructions 233 // - (eventually some more low latency instructions) 234 // - instructions that depend on the first low latency instructions. 235 // If in the block there is a lot of constant loads, the SGPR usage 236 // could go quite high, thus above the arbitrary limit of 60 will encourage 237 // use the already loaded constants (in order to release some SGPRs) before 238 // loading more. 239 if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent, 240 Cand.HasLowLatencyNonWaitedParent, 241 TryCand, Cand, SIScheduleCandReason::Depth)) 242 return; 243 244 if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, 245 TryCand, Cand, SIScheduleCandReason::Depth)) 246 return; 247 248 if (TryCand.IsLowLatency && 249 SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, 250 TryCand, Cand, SIScheduleCandReason::Depth)) 251 return; 252 253 if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, 254 TryCand, Cand, RegUsage)) 255 return; 256 257 // Fall through to original instruction order. 
258 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { 259 TryCand.Reason = NodeOrder; 260 } 261 } 262 263 SUnit* SIScheduleBlock::pickNode() { 264 SISchedCandidate TopCand; 265 266 for (SUnit* SU : TopReadySUs) { 267 SISchedCandidate TryCand; 268 std::vector<unsigned> pressure; 269 std::vector<unsigned> MaxPressure; 270 // Predict register usage after this instruction. 271 TryCand.SU = SU; 272 TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); 273 TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; 274 TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; 275 TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; 276 TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; 277 TryCand.HasLowLatencyNonWaitedParent = 278 HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; 279 tryCandidateTopDown(TopCand, TryCand); 280 if (TryCand.Reason != NoCand) 281 TopCand.setBest(TryCand); 282 } 283 284 return TopCand.SU; 285 } 286 287 288 // Schedule something valid. 289 void SIScheduleBlock::fastSchedule() { 290 TopReadySUs.clear(); 291 if (Scheduled) 292 undoSchedule(); 293 294 for (SUnit* SU : SUnits) { 295 if (!SU->NumPredsLeft) 296 TopReadySUs.push_back(SU); 297 } 298 299 while (!TopReadySUs.empty()) { 300 SUnit *SU = TopReadySUs[0]; 301 ScheduledSUnits.push_back(SU); 302 nodeScheduled(SU); 303 } 304 305 Scheduled = true; 306 } 307 308 // Returns if the register was set between first and last. 
309 static bool isDefBetween(unsigned Reg, 310 SlotIndex First, SlotIndex Last, 311 const MachineRegisterInfo *MRI, 312 const LiveIntervals *LIS) { 313 for (MachineRegisterInfo::def_instr_iterator 314 UI = MRI->def_instr_begin(Reg), 315 UE = MRI->def_instr_end(); UI != UE; ++UI) { 316 const MachineInstr* MI = &*UI; 317 if (MI->isDebugValue()) 318 continue; 319 SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); 320 if (InstSlot >= First && InstSlot <= Last) 321 return true; 322 } 323 return false; 324 } 325 326 void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, 327 MachineBasicBlock::iterator EndBlock) { 328 IntervalPressure Pressure, BotPressure; 329 RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); 330 LiveIntervals *LIS = DAG->getLIS(); 331 MachineRegisterInfo *MRI = DAG->getMRI(); 332 DAG->initRPTracker(TopRPTracker); 333 DAG->initRPTracker(BotRPTracker); 334 DAG->initRPTracker(RPTracker); 335 336 // Goes though all SU. RPTracker captures what had to be alive for the SUs 337 // to execute, and what is still alive at the end. 338 for (SUnit* SU : ScheduledSUnits) { 339 RPTracker.setPos(SU->getInstr()); 340 RPTracker.advance(); 341 } 342 343 // Close the RPTracker to finalize live ins/outs. 344 RPTracker.closeRegion(); 345 346 // Initialize the live ins and live outs. 347 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); 348 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); 349 350 // Do not Track Physical Registers, because it messes up. 
351 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { 352 if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) 353 LiveInRegs.insert(RegMaskPair.RegUnit); 354 } 355 LiveOutRegs.clear(); 356 // There is several possibilities to distinguish: 357 // 1) Reg is not input to any instruction in the block, but is output of one 358 // 2) 1) + read in the block and not needed after it 359 // 3) 1) + read in the block but needed in another block 360 // 4) Reg is input of an instruction but another block will read it too 361 // 5) Reg is input of an instruction and then rewritten in the block. 362 // result is not read in the block (implies used in another block) 363 // 6) Reg is input of an instruction and then rewritten in the block. 364 // result is read in the block and not needed in another block 365 // 7) Reg is input of an instruction and then rewritten in the block. 366 // result is read in the block but also needed in another block 367 // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 368 // We want LiveOutRegs to contain only Regs whose content will be read after 369 // in another block, and whose content was written in the current block, 370 // that is we want it to get 1, 3, 5, 7 371 // Since we made the MIs of a block to be packed all together before 372 // scheduling, then the LiveIntervals were correct, and the RPTracker was 373 // able to correctly handle 5 vs 6, 2 vs 3. 374 // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) 375 // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 376 // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 377 // The use of findDefBetween removes the case 4. 
378 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { 379 unsigned Reg = RegMaskPair.RegUnit; 380 if (TargetRegisterInfo::isVirtualRegister(Reg) && 381 isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), 382 LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, 383 LIS)) { 384 LiveOutRegs.insert(Reg); 385 } 386 } 387 388 // Pressure = sum_alive_registers register size 389 // Internally llvm will represent some registers as big 128 bits registers 390 // for example, but they actually correspond to 4 actual 32 bits registers. 391 // Thus Pressure is not equal to num_alive_registers * constant. 392 LiveInPressure = TopPressure.MaxSetPressure; 393 LiveOutPressure = BotPressure.MaxSetPressure; 394 395 // Prepares TopRPTracker for top down scheduling. 396 TopRPTracker.closeTop(); 397 } 398 399 void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, 400 MachineBasicBlock::iterator EndBlock) { 401 if (!Scheduled) 402 fastSchedule(); 403 404 // PreScheduling phase to set LiveIn and LiveOut. 405 initRegPressure(BeginBlock, EndBlock); 406 undoSchedule(); 407 408 // Schedule for real now. 409 410 TopReadySUs.clear(); 411 412 for (SUnit* SU : SUnits) { 413 if (!SU->NumPredsLeft) 414 TopReadySUs.push_back(SU); 415 } 416 417 while (!TopReadySUs.empty()) { 418 SUnit *SU = pickNode(); 419 ScheduledSUnits.push_back(SU); 420 TopRPTracker.setPos(SU->getInstr()); 421 TopRPTracker.advance(); 422 nodeScheduled(SU); 423 } 424 425 // TODO: compute InternalAdditionnalPressure. 426 InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); 427 428 // Check everything is right. 
429 #ifndef NDEBUG 430 assert(SUnits.size() == ScheduledSUnits.size() && 431 TopReadySUs.empty()); 432 for (SUnit* SU : SUnits) { 433 assert(SU->isScheduled && 434 SU->NumPredsLeft == 0); 435 } 436 #endif 437 438 Scheduled = true; 439 } 440 441 void SIScheduleBlock::undoSchedule() { 442 for (SUnit* SU : SUnits) { 443 SU->isScheduled = false; 444 for (SDep& Succ : SU->Succs) { 445 if (BC->isSUInBlock(Succ.getSUnit(), ID)) 446 undoReleaseSucc(SU, &Succ); 447 } 448 } 449 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); 450 ScheduledSUnits.clear(); 451 Scheduled = false; 452 } 453 454 void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { 455 SUnit *SuccSU = SuccEdge->getSUnit(); 456 457 if (SuccEdge->isWeak()) { 458 ++SuccSU->WeakPredsLeft; 459 return; 460 } 461 ++SuccSU->NumPredsLeft; 462 } 463 464 void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { 465 SUnit *SuccSU = SuccEdge->getSUnit(); 466 467 if (SuccEdge->isWeak()) { 468 --SuccSU->WeakPredsLeft; 469 return; 470 } 471 #ifndef NDEBUG 472 if (SuccSU->NumPredsLeft == 0) { 473 dbgs() << "*** Scheduling failed! ***\n"; 474 DAG->dumpNode(*SuccSU); 475 dbgs() << " has been released too many times!\n"; 476 llvm_unreachable(nullptr); 477 } 478 #endif 479 480 --SuccSU->NumPredsLeft; 481 } 482 483 /// Release Successors of the SU that are in the block or not. 
484 void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { 485 for (SDep& Succ : SU->Succs) { 486 SUnit *SuccSU = Succ.getSUnit(); 487 488 if (SuccSU->NodeNum >= DAG->SUnits.size()) 489 continue; 490 491 if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) 492 continue; 493 494 releaseSucc(SU, &Succ); 495 if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) 496 TopReadySUs.push_back(SuccSU); 497 } 498 } 499 500 void SIScheduleBlock::nodeScheduled(SUnit *SU) { 501 // Is in TopReadySUs 502 assert (!SU->NumPredsLeft); 503 std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU); 504 if (I == TopReadySUs.end()) { 505 dbgs() << "Data Structure Bug in SI Scheduler\n"; 506 llvm_unreachable(nullptr); 507 } 508 TopReadySUs.erase(I); 509 510 releaseSuccessors(SU, true); 511 // Scheduling this node will trigger a wait, 512 // thus propagate to other instructions that they do not need to wait either. 513 if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) 514 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); 515 516 if (DAG->IsLowLatencySU[SU->NodeNum]) { 517 for (SDep& Succ : SU->Succs) { 518 std::map<unsigned, unsigned>::iterator I = 519 NodeNum2Index.find(Succ.getSUnit()->NodeNum); 520 if (I != NodeNum2Index.end()) 521 HasLowLatencyNonWaitedParent[I->second] = 1; 522 } 523 } 524 SU->isScheduled = true; 525 } 526 527 void SIScheduleBlock::finalizeUnits() { 528 // We remove links from outside blocks to enable scheduling inside the block. 529 for (SUnit* SU : SUnits) { 530 releaseSuccessors(SU, false); 531 if (DAG->IsHighLatencySU[SU->NodeNum]) 532 HighLatencyBlock = true; 533 } 534 HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); 535 } 536 537 // we maintain ascending order of IDs 538 void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { 539 unsigned PredID = Pred->getID(); 540 541 // Check if not already predecessor. 
542 for (SIScheduleBlock* P : Preds) { 543 if (PredID == P->getID()) 544 return; 545 } 546 Preds.push_back(Pred); 547 548 assert(none_of(Succs, 549 [=](std::pair<SIScheduleBlock*, 550 SIScheduleBlockLinkKind> S) { 551 return PredID == S.first->getID(); 552 }) && 553 "Loop in the Block Graph!"); 554 } 555 556 void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, 557 SIScheduleBlockLinkKind Kind) { 558 unsigned SuccID = Succ->getID(); 559 560 // Check if not already predecessor. 561 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { 562 if (SuccID == S.first->getID()) { 563 if (S.second == SIScheduleBlockLinkKind::NoData && 564 Kind == SIScheduleBlockLinkKind::Data) 565 S.second = Kind; 566 return; 567 } 568 } 569 if (Succ->isHighLatencyBlock()) 570 ++NumHighLatencySuccessors; 571 Succs.push_back(std::make_pair(Succ, Kind)); 572 573 assert(none_of(Preds, 574 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && 575 "Loop in the Block Graph!"); 576 } 577 578 #ifndef NDEBUG 579 void SIScheduleBlock::printDebug(bool full) { 580 dbgs() << "Block (" << ID << ")\n"; 581 if (!full) 582 return; 583 584 dbgs() << "\nContains High Latency Instruction: " 585 << HighLatencyBlock << '\n'; 586 dbgs() << "\nDepends On:\n"; 587 for (SIScheduleBlock* P : Preds) { 588 P->printDebug(false); 589 } 590 591 dbgs() << "\nSuccessors:\n"; 592 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { 593 if (S.second == SIScheduleBlockLinkKind::Data) 594 dbgs() << "(Data Dep) "; 595 S.first->printDebug(false); 596 } 597 598 if (Scheduled) { 599 dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' 600 << LiveInPressure[DAG->getVGPRSetID()] << '\n'; 601 dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' 602 << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; 603 dbgs() << "LiveIns:\n"; 604 for (unsigned Reg : LiveInRegs) 605 dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; 606 607 dbgs() << 
"\nLiveOuts:\n"; 608 for (unsigned Reg : LiveOutRegs) 609 dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; 610 } 611 612 dbgs() << "\nInstructions:\n"; 613 if (!Scheduled) { 614 for (const SUnit* SU : SUnits) 615 DAG->dumpNode(*SU); 616 } else { 617 for (const SUnit* SU : SUnits) 618 DAG->dumpNode(*SU); 619 } 620 621 dbgs() << "///////////////////////\n"; 622 } 623 #endif 624 625 // SIScheduleBlockCreator // 626 627 SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : 628 DAG(DAG) { 629 } 630 631 SIScheduleBlockCreator::~SIScheduleBlockCreator() = default; 632 633 SIScheduleBlocks 634 SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { 635 std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = 636 Blocks.find(BlockVariant); 637 if (B == Blocks.end()) { 638 SIScheduleBlocks Res; 639 createBlocksForVariant(BlockVariant); 640 topologicalSort(); 641 scheduleInsideBlocks(); 642 fillStats(); 643 Res.Blocks = CurrentBlocks; 644 Res.TopDownIndex2Block = TopDownIndex2Block; 645 Res.TopDownBlock2Index = TopDownBlock2Index; 646 Blocks[BlockVariant] = Res; 647 return Res; 648 } else { 649 return B->second; 650 } 651 } 652 653 bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { 654 if (SU->NodeNum >= DAG->SUnits.size()) 655 return false; 656 return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; 657 } 658 659 void SIScheduleBlockCreator::colorHighLatenciesAlone() { 660 unsigned DAGSize = DAG->SUnits.size(); 661 662 for (unsigned i = 0, e = DAGSize; i != e; ++i) { 663 SUnit *SU = &DAG->SUnits[i]; 664 if (DAG->IsHighLatencySU[SU->NodeNum]) { 665 CurrentColoring[SU->NodeNum] = NextReservedID++; 666 } 667 } 668 } 669 670 static bool 671 hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { 672 for (const auto &PredDep : SU.Preds) { 673 if (PredDep.getSUnit() == &FromSU && 674 PredDep.getKind() == llvm::SDep::Data) 675 return true; 676 } 677 return false; 678 } 679 680 void 
SIScheduleBlockCreator::colorHighLatenciesGroups() { 681 unsigned DAGSize = DAG->SUnits.size(); 682 unsigned NumHighLatencies = 0; 683 unsigned GroupSize; 684 int Color = NextReservedID; 685 unsigned Count = 0; 686 std::set<unsigned> FormingGroup; 687 688 for (unsigned i = 0, e = DAGSize; i != e; ++i) { 689 SUnit *SU = &DAG->SUnits[i]; 690 if (DAG->IsHighLatencySU[SU->NodeNum]) 691 ++NumHighLatencies; 692 } 693 694 if (NumHighLatencies == 0) 695 return; 696 697 if (NumHighLatencies <= 6) 698 GroupSize = 2; 699 else if (NumHighLatencies <= 12) 700 GroupSize = 3; 701 else 702 GroupSize = 4; 703 704 for (unsigned SUNum : DAG->TopDownIndex2SU) { 705 const SUnit &SU = DAG->SUnits[SUNum]; 706 if (DAG->IsHighLatencySU[SU.NodeNum]) { 707 unsigned CompatibleGroup = true; 708 int ProposedColor = Color; 709 std::vector<int> AdditionalElements; 710 711 // We don't want to put in the same block 712 // two high latency instructions that depend 713 // on each other. 714 // One way would be to check canAddEdge 715 // in both directions, but that currently is not 716 // enough because there the high latency order is 717 // enforced (via links). 718 // Instead, look at the dependencies between the 719 // high latency instructions and deduce if it is 720 // a data dependency or not. 721 for (unsigned j : FormingGroup) { 722 bool HasSubGraph; 723 std::vector<int> SubGraph; 724 // By construction (topological order), if SU and 725 // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary 726 // in the parent graph of SU. 727 #ifndef NDEBUG 728 SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], 729 HasSubGraph); 730 assert(!HasSubGraph); 731 #endif 732 SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, 733 HasSubGraph); 734 if (!HasSubGraph) 735 continue; // No dependencies between each other 736 else if (SubGraph.size() > 5) { 737 // Too many elements would be required to be added to the block. 
738 CompatibleGroup = false; 739 break; 740 } 741 else { 742 // Check the type of dependency 743 for (unsigned k : SubGraph) { 744 // If in the path to join the two instructions, 745 // there is another high latency instruction, 746 // or instructions colored for another block 747 // abort the merge. 748 if (DAG->IsHighLatencySU[k] || 749 (CurrentColoring[k] != ProposedColor && 750 CurrentColoring[k] != 0)) { 751 CompatibleGroup = false; 752 break; 753 } 754 // If one of the SU in the subgraph depends on the result of SU j, 755 // there'll be a data dependency. 756 if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { 757 CompatibleGroup = false; 758 break; 759 } 760 } 761 if (!CompatibleGroup) 762 break; 763 // Same check for the SU 764 if (hasDataDependencyPred(SU, DAG->SUnits[j])) { 765 CompatibleGroup = false; 766 break; 767 } 768 // Add all the required instructions to the block 769 // These cannot live in another block (because they 770 // depend (order dependency) on one of the 771 // instruction in the block, and are required for the 772 // high latency instruction we add. 773 AdditionalElements.insert(AdditionalElements.end(), 774 SubGraph.begin(), SubGraph.end()); 775 } 776 } 777 if (CompatibleGroup) { 778 FormingGroup.insert(SU.NodeNum); 779 for (unsigned j : AdditionalElements) 780 CurrentColoring[j] = ProposedColor; 781 CurrentColoring[SU.NodeNum] = ProposedColor; 782 ++Count; 783 } 784 // Found one incompatible instruction, 785 // or has filled a big enough group. 786 // -> start a new one. 
787 if (!CompatibleGroup) { 788 FormingGroup.clear(); 789 Color = ++NextReservedID; 790 ProposedColor = Color; 791 FormingGroup.insert(SU.NodeNum); 792 CurrentColoring[SU.NodeNum] = ProposedColor; 793 Count = 0; 794 } else if (Count == GroupSize) { 795 FormingGroup.clear(); 796 Color = ++NextReservedID; 797 ProposedColor = Color; 798 Count = 0; 799 } 800 } 801 } 802 } 803 804 void SIScheduleBlockCreator::colorComputeReservedDependencies() { 805 unsigned DAGSize = DAG->SUnits.size(); 806 std::map<std::set<unsigned>, unsigned> ColorCombinations; 807 808 CurrentTopDownReservedDependencyColoring.clear(); 809 CurrentBottomUpReservedDependencyColoring.clear(); 810 811 CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); 812 CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); 813 814 // Traverse TopDown, and give different colors to SUs depending 815 // on which combination of High Latencies they depend on. 816 817 for (unsigned SUNum : DAG->TopDownIndex2SU) { 818 SUnit *SU = &DAG->SUnits[SUNum]; 819 std::set<unsigned> SUColors; 820 821 // Already given. 822 if (CurrentColoring[SU->NodeNum]) { 823 CurrentTopDownReservedDependencyColoring[SU->NodeNum] = 824 CurrentColoring[SU->NodeNum]; 825 continue; 826 } 827 828 for (SDep& PredDep : SU->Preds) { 829 SUnit *Pred = PredDep.getSUnit(); 830 if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) 831 continue; 832 if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) 833 SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); 834 } 835 // Color 0 by default. 836 if (SUColors.empty()) 837 continue; 838 // Same color than parents. 
839 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) 840 CurrentTopDownReservedDependencyColoring[SU->NodeNum] = 841 *SUColors.begin(); 842 else { 843 std::map<std::set<unsigned>, unsigned>::iterator Pos = 844 ColorCombinations.find(SUColors); 845 if (Pos != ColorCombinations.end()) { 846 CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; 847 } else { 848 CurrentTopDownReservedDependencyColoring[SU->NodeNum] = 849 NextNonReservedID; 850 ColorCombinations[SUColors] = NextNonReservedID++; 851 } 852 } 853 } 854 855 ColorCombinations.clear(); 856 857 // Same as before, but BottomUp. 858 859 for (unsigned SUNum : DAG->BottomUpIndex2SU) { 860 SUnit *SU = &DAG->SUnits[SUNum]; 861 std::set<unsigned> SUColors; 862 863 // Already given. 864 if (CurrentColoring[SU->NodeNum]) { 865 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = 866 CurrentColoring[SU->NodeNum]; 867 continue; 868 } 869 870 for (SDep& SuccDep : SU->Succs) { 871 SUnit *Succ = SuccDep.getSUnit(); 872 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) 873 continue; 874 if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) 875 SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); 876 } 877 // Keep color 0. 878 if (SUColors.empty()) 879 continue; 880 // Same color than parents. 
881 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) 882 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = 883 *SUColors.begin(); 884 else { 885 std::map<std::set<unsigned>, unsigned>::iterator Pos = 886 ColorCombinations.find(SUColors); 887 if (Pos != ColorCombinations.end()) { 888 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; 889 } else { 890 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = 891 NextNonReservedID; 892 ColorCombinations[SUColors] = NextNonReservedID++; 893 } 894 } 895 } 896 } 897 898 void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { 899 unsigned DAGSize = DAG->SUnits.size(); 900 std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; 901 902 // Every combination of colors given by the top down 903 // and bottom up Reserved node dependency 904 905 for (unsigned i = 0, e = DAGSize; i != e; ++i) { 906 SUnit *SU = &DAG->SUnits[i]; 907 std::pair<unsigned, unsigned> SUColors; 908 909 // High latency instructions: already given. 910 if (CurrentColoring[SU->NodeNum]) 911 continue; 912 913 SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; 914 SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; 915 916 std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = 917 ColorCombinations.find(SUColors); 918 if (Pos != ColorCombinations.end()) { 919 CurrentColoring[SU->NodeNum] = Pos->second; 920 } else { 921 CurrentColoring[SU->NodeNum] = NextNonReservedID; 922 ColorCombinations[SUColors] = NextNonReservedID++; 923 } 924 } 925 } 926 927 void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { 928 unsigned DAGSize = DAG->SUnits.size(); 929 std::vector<int> PendingColoring = CurrentColoring; 930 931 assert(DAGSize >= 1 && 932 CurrentBottomUpReservedDependencyColoring.size() == DAGSize && 933 CurrentTopDownReservedDependencyColoring.size() == DAGSize); 934 // If there is no reserved block at all, do nothing. 
  // We don't want everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Reserved colors (<= DAGSize) are fixed; skip.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only SUs with no reserved-dependency color of their own qualify
    // as "ends".
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


// Force each non-reserved color to form one consecutive run in NodeNum
// order: once a color has ended (another color was seen after it), a
// later SU with the same color is re-colored — either joining the SU
// just before it, or receiving a brand new color.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // A color change means the previous color's run has ended.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors are never altered.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Color has not ended yet: nothing to fix.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

// Merge constant-load SUs into the group of their users when all the
// users share a single color. Candidates are SUs with no predecessor
// (VGPR constant loads) or low-latency SUs (SGPR loads, whose only
// predecessor is typically the address).
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: Vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address)
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    // All users share one color: the load can join their group.
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

// Merge any non-reserved SU into the group of its successors when all
// of its successors carry a single color.
// NOTE(review): appears unused by createBlocksForVariant below, which
// calls only the ...OnlyForReserved variant — confirm before removing.
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

// Same as colorMergeIfPossibleNextGroup, but only merges into reserved
// groups (colors <= DAGSize, i.e. the high-latency blocks).
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    // Only merge when the target color is reserved.
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

// Absorb singleton groups (colors used by exactly one SU) into the
// group of their successors, keeping the per-color population count up
// to date as merges happen.
void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  // First pass: count how many SUs carry each color.
  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only singleton groups are merged away.
    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

// Placeholder: split blocks that grew too large.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

// Put all SUs without any strong in-graph successor (typically stores
// and other result-less instructions) into one common fresh group.
void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // ie without needing any other instruction.
      // The EXP is allowed to depend on other EXP
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Other instructions than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

// Run the full coloring pipeline for the requested variant, then
// materialize one SIScheduleBlock per resulting color, wire up
// inter-block dependencies, and finalize each block for scheduling.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a color to the index of the block created for it.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Colors 1..DAGSize are "reserved" (high latency groups); colors
  // above DAGSize are free for the other passes to allocate.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
/// Advance I past any debug instructions, returning the first
/// non-debug instruction at or after I, or End.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

// Topologically sort the blocks (Kahn-style, seeded from blocks with
// no successors), producing both the top-down and bottom-up orders.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the remaining successor count
  // of each block; it is overwritten with the final index below.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Indices are assigned from the back (leaves get the highest ones).
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  // The bottom-up order is simply the reversed top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

// Schedule the instructions inside each block and reorder the MI stream
// so that every block occupies a contiguous range of instructions.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  // Move the MIs into block order, remembering both the old and the new
  // position of every instruction so the move can be undone afterwards.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance the insertion point.
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Undo the moves in reverse order of how they were recorded.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}

// Compute per-block Depth (longest cost-weighted path from a root) and
// Height (longest cost-weighted path to a leaf), used by the block
// scheduler's heuristics.
void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  // Depth: processed in top-down order so predecessors are done first.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  // Height: processed in bottom-up order so successors are done first.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

// Construction runs the whole block-level scheduling: it prepares the
// live-register bookkeeping, then repeatedly picks and "schedules"
// blocks until none remain.
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y.
  // B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      // Find the producing predecessor with the highest topological
      // index (the closest producer in schedule order).
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  // Block IDs are expected to match their position in Blocks.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      // Attribute the external use to the last producer in topological
      // order, then stop looking.
      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      // No predecessor produces it: the reg comes from outside the
      // region, count this block as one of its consumers.
      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  // Seed the ready list with blocks that have no predecessors.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Main loop: pick the next best block until none are ready.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

// Latency-oriented comparison between the current best candidate and a
// new one. Returns true when TryCand wins (its Reason is then set).
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

// Register-pressure-oriented comparison between the current best
// candidate and a new one. Returns true when TryCand wins.
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Prefer blocks that do not increase VGPR pressure at all.
  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

// Pick the next block to schedule from ReadyBlocks according to the
// chosen variant, remove it from the ready list and return it. Returns
// nullptr when no block is ready.
SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  // Refresh the current register usage and the running maxima.
  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    // How recently (relative to the last wait) a high latency parent
    // of this block was scheduled; 0 if already waited for.
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

// Add Regs to the live set (virtual registers only; duplicates are
// ignored by the set insert).
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

// Record that Block consumed Regs: decrement each register's consumer
// count and drop it from the live set once no consumer remains.
void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                       std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert (Pos != LiveRegs.end() && // Reg must be live.
            LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
            LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

// Release the successors of a just-scheduled block: a successor whose
// last predecessor was Parent becomes ready. Data successors of a high
// latency block also record the position of that parent.
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

// Update all bookkeeping after Block has been chosen: live registers,
// ready successors, consumer counts for the block's outputs, and the
// high-latency wait position.
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

// Estimate the per-pressure-set register usage delta of scheduling a
// block with the given input and output registers: inputs whose last
// consumer this would be decrease pressure, outputs increase it.
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                     std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // Still needed by another consumer: no pressure released.
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

// Run one (block creation variant, block scheduling variant) pair and
// return the resulting SU order together with the peak register usage.
struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  // Flatten the per-block SU orders into one sequence.
  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  // Cache the SI-specific instruction and register info, and the
  // pressure-set IDs used for VGPR/SGPR usage tracking.
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from
// scheduleDAG.cpp
// Does a topological sort over the SUs.
// Both TopDown and BottomUp
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latencies further from their user without
// increasing SGPR usage (in general)
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR Usage
// and the corresponding wavefront count), that would
// try to merge groups of loads if it makes sense, etc
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    // Earliest position this SU may move to, just after its latest
    // predecessor in the current schedule.
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      // Hoist the load as early as its dependencies and the previously
      // placed low latency instructions/users allow.
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Shift everything in [BestPos, i) one slot down, keeping the
        // inverse index in sync.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Moves COPY instructions on which depends
    // the low latency instructions too.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

// Reset per-SU scheduling state from the backup taken before the first
// variant ran, so the next variant starts from a clean slate.
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1907 template<typename _Iterator> void 1908 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, 1909 unsigned &VgprUsage, unsigned &SgprUsage) { 1910 VgprUsage = 0; 1911 SgprUsage = 0; 1912 for (_Iterator RegI = First; RegI != End; ++RegI) { 1913 unsigned Reg = *RegI; 1914 // For now only track virtual registers 1915 if (!TargetRegisterInfo::isVirtualRegister(Reg)) 1916 continue; 1917 PSetIterator PSetI = MRI.getPressureSets(Reg); 1918 for (; PSetI.isValid(); ++PSetI) { 1919 if (*PSetI == VGPRSetID) 1920 VgprUsage += PSetI.getWeight(); 1921 else if (*PSetI == SGPRSetID) 1922 SgprUsage += PSetI.getWeight(); 1923 } 1924 } 1925 } 1926 1927 void SIScheduleDAGMI::schedule() 1928 { 1929 SmallVector<SUnit*, 8> TopRoots, BotRoots; 1930 SIScheduleBlockResult Best, Temp; 1931 LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); 1932 1933 buildDAGWithRegPressure(); 1934 LLVM_DEBUG(dump()); 1935 1936 topologicalSort(); 1937 findRootsAndBiasEdges(TopRoots, BotRoots); 1938 // We reuse several ScheduleDAGMI and ScheduleDAGMILive 1939 // functions, but to make them happy we must initialize 1940 // the default Scheduler implementation (even if we do not 1941 // run it) 1942 SchedImpl->initialize(this); 1943 initQueues(TopRoots, BotRoots); 1944 1945 // Fill some stats to help scheduling. 
1946 1947 SUnitsLinksBackup = SUnits; 1948 IsLowLatencySU.clear(); 1949 LowLatencyOffset.clear(); 1950 IsHighLatencySU.clear(); 1951 1952 IsLowLatencySU.resize(SUnits.size(), 0); 1953 LowLatencyOffset.resize(SUnits.size(), 0); 1954 IsHighLatencySU.resize(SUnits.size(), 0); 1955 1956 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { 1957 SUnit *SU = &SUnits[i]; 1958 unsigned BaseLatReg; 1959 int64_t OffLatReg; 1960 if (SITII->isLowLatencyInstruction(*SU->getInstr())) { 1961 IsLowLatencySU[i] = 1; 1962 if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg, 1963 TRI)) 1964 LowLatencyOffset[i] = OffLatReg; 1965 } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) 1966 IsHighLatencySU[i] = 1; 1967 } 1968 1969 SIScheduler Scheduler(this); 1970 Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, 1971 SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); 1972 1973 // if VGPR usage is extremely high, try other good performing variants 1974 // which could lead to lower VGPR usage 1975 if (Best.MaxVGPRUsage > 180) { 1976 static const std::pair<SISchedulerBlockCreatorVariant, 1977 SISchedulerBlockSchedulerVariant> 1978 Variants[] = { 1979 { LatenciesAlone, BlockRegUsageLatency }, 1980 // { LatenciesAlone, BlockRegUsage }, 1981 { LatenciesGrouped, BlockLatencyRegUsage }, 1982 // { LatenciesGrouped, BlockRegUsageLatency }, 1983 // { LatenciesGrouped, BlockRegUsage }, 1984 { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, 1985 // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, 1986 // { LatenciesAlonePlusConsecutive, BlockRegUsage } 1987 }; 1988 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { 1989 Temp = Scheduler.scheduleVariant(v.first, v.second); 1990 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) 1991 Best = Temp; 1992 } 1993 } 1994 // if VGPR usage is still extremely high, we may spill. 
Try other variants 1995 // which are less performing, but that could lead to lower VGPR usage. 1996 if (Best.MaxVGPRUsage > 200) { 1997 static const std::pair<SISchedulerBlockCreatorVariant, 1998 SISchedulerBlockSchedulerVariant> 1999 Variants[] = { 2000 // { LatenciesAlone, BlockRegUsageLatency }, 2001 { LatenciesAlone, BlockRegUsage }, 2002 // { LatenciesGrouped, BlockLatencyRegUsage }, 2003 { LatenciesGrouped, BlockRegUsageLatency }, 2004 { LatenciesGrouped, BlockRegUsage }, 2005 // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, 2006 { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, 2007 { LatenciesAlonePlusConsecutive, BlockRegUsage } 2008 }; 2009 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { 2010 Temp = Scheduler.scheduleVariant(v.first, v.second); 2011 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) 2012 Best = Temp; 2013 } 2014 } 2015 2016 ScheduledSUnits = Best.SUs; 2017 ScheduledSUnitsInv.resize(SUnits.size()); 2018 2019 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { 2020 ScheduledSUnitsInv[ScheduledSUnits[i]] = i; 2021 } 2022 2023 moveLowLatencies(); 2024 2025 // Tell the outside world about the result of the scheduling. 2026 2027 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); 2028 TopRPTracker.setPos(CurrentTop); 2029 2030 for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), 2031 E = ScheduledSUnits.end(); I != E; ++I) { 2032 SUnit *SU = &SUnits[*I]; 2033 2034 scheduleMI(SU, true); 2035 2036 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " 2037 << *SU->getInstr()); 2038 } 2039 2040 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); 2041 2042 placeDebugValues(); 2043 2044 LLVM_DEBUG({ 2045 dbgs() << "*** Final schedule for " 2046 << printMBBReference(*begin()->getParent()) << " ***\n"; 2047 dumpSchedule(); 2048 dbgs() << '\n'; 2049 }); 2050 } 2051