1 //===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // MachineScheduler schedules machine instructions after phi elimination. It 11 // preserves LiveIntervals so it can be invoked before register allocation. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "HexagonMachineScheduler.h" 16 #include "HexagonSubtarget.h" 17 #include "llvm/CodeGen/MachineLoopInfo.h" 18 #include "llvm/CodeGen/ScheduleDAGMutation.h" 19 #include "llvm/IR/Function.h" 20 21 #include <iomanip> 22 #include <sstream> 23 24 static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure", 25 cl::Hidden, cl::ZeroOrMore, cl::init(false)); 26 27 static cl::opt<bool> SchedPredsCloser("sched-preds-closer", 28 cl::Hidden, cl::ZeroOrMore, cl::init(true)); 29 30 static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level", 31 cl::Hidden, cl::ZeroOrMore, cl::init(1)); 32 33 static cl::opt<bool> TopUseShorterTie("top-use-shorter-tie", 34 cl::Hidden, cl::ZeroOrMore, cl::init(false)); 35 36 static cl::opt<bool> BotUseShorterTie("bot-use-shorter-tie", 37 cl::Hidden, cl::ZeroOrMore, cl::init(false)); 38 39 static cl::opt<bool> DisableTCTie("disable-tc-tie", 40 cl::Hidden, cl::ZeroOrMore, cl::init(false)); 41 42 static cl::opt<bool> SchedRetvalOptimization("sched-retval-optimization", 43 cl::Hidden, cl::ZeroOrMore, cl::init(true)); 44 45 // Check if the scheduler should penalize instructions that are available to 46 // early due to a zero-latency dependence. 47 static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden, 48 cl::ZeroOrMore, cl::init(true)); 49 50 using namespace llvm; 51 52 #define DEBUG_TYPE "misched" 53 54 namespace { 55 class HexagonCallMutation : public ScheduleDAGMutation { 56 public: 57 void apply(ScheduleDAGInstrs *DAG) override; 58 private: 59 bool shouldTFRICallBind(const HexagonInstrInfo &HII, 60 const SUnit &Inst1, const SUnit &Inst2) const; 61 }; 62 } // end anonymous namespace 63 64 // Check if a call and subsequent A2_tfrpi instructions should maintain 65 // scheduling affinity. We are looking for the TFRI to be consumed in 66 // the next instruction. This should help reduce the instances of 67 // double register pairs being allocated and scheduled before a call 68 // when not used until after the call. This situation is exacerbated 69 // by the fact that we allocate the pair from the callee saves list, 70 // leading to excess spills and restores. 71 bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII, 72 const SUnit &Inst1, const SUnit &Inst2) const { 73 if (Inst1.getInstr()->getOpcode() != Hexagon::A2_tfrpi) 74 return false; 75 76 // TypeXTYPE are 64 bit operations. 77 if (HII.getType(*Inst2.getInstr()) == HexagonII::TypeXTYPE) 78 return true; 79 return false; 80 } 81 82 void HexagonCallMutation::apply(ScheduleDAGInstrs *DAG) { 83 SUnit* LastSequentialCall = nullptr; 84 unsigned VRegHoldingRet = 0; 85 unsigned RetRegister; 86 SUnit* LastUseOfRet = nullptr; 87 auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo(); 88 auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); 89 90 // Currently we only catch the situation when compare gets scheduled 91 // before preceding call. 92 for (unsigned su = 0, e = DAG->SUnits.size(); su != e; ++su) { 93 // Remember the call. 94 if (DAG->SUnits[su].getInstr()->isCall()) 95 LastSequentialCall = &DAG->SUnits[su]; 96 // Look for a compare that defines a predicate. 97 else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall) 98 DAG->SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier)); 99 // Look for call and tfri* instructions. 100 else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 && 101 shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1])) 102 DAG->SUnits[su].addPred(SDep(&DAG->SUnits[su-1], SDep::Barrier)); 103 // Prevent redundant register copies between two calls, which are caused by 104 // both the return value and the argument for the next call being in %R0. 105 // Example: 106 // 1: <call1> 107 // 2: %VregX = COPY %R0 108 // 3: <use of %VregX> 109 // 4: %R0 = ... 110 // 5: <call2> 111 // The scheduler would often swap 3 and 4, so an additional register is 112 // needed. This code inserts a Barrier dependence between 3 & 4 to prevent 113 // this. The same applies for %D0 and %V0/%W0, which are also handled. 114 else if (SchedRetvalOptimization) { 115 const MachineInstr *MI = DAG->SUnits[su].getInstr(); 116 if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) || 117 MI->readsRegister(Hexagon::V0, &TRI))) { 118 // %vregX = COPY %R0 119 VRegHoldingRet = MI->getOperand(0).getReg(); 120 RetRegister = MI->getOperand(1).getReg(); 121 LastUseOfRet = nullptr; 122 } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet)) 123 // <use of %vregX> 124 LastUseOfRet = &DAG->SUnits[su]; 125 else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI)) 126 // %R0 = ... 127 DAG->SUnits[su].addPred(SDep(LastUseOfRet, SDep::Barrier)); 128 } 129 } 130 } 131 132 133 /// Save the last formed packet 134 void VLIWResourceModel::savePacket() { 135 OldPacket = Packet; 136 } 137 138 /// Check if scheduling of this SU is possible 139 /// in the current packet. 140 /// It is _not_ precise (statefull), it is more like 141 /// another heuristic. Many corner cases are figured 142 /// empirically. 143 bool VLIWResourceModel::isResourceAvailable(SUnit *SU) { 144 if (!SU || !SU->getInstr()) 145 return false; 146 147 // First see if the pipeline could receive this instruction 148 // in the current cycle. 149 switch (SU->getInstr()->getOpcode()) { 150 default: 151 if (!ResourcesModel->canReserveResources(*SU->getInstr())) 152 return false; 153 case TargetOpcode::EXTRACT_SUBREG: 154 case TargetOpcode::INSERT_SUBREG: 155 case TargetOpcode::SUBREG_TO_REG: 156 case TargetOpcode::REG_SEQUENCE: 157 case TargetOpcode::IMPLICIT_DEF: 158 case TargetOpcode::COPY: 159 case TargetOpcode::INLINEASM: 160 break; 161 } 162 163 MachineFunction &MF = *SU->getInstr()->getParent()->getParent(); 164 auto &QII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); 165 166 // Now see if there are no other dependencies to instructions already 167 // in the packet. 168 for (unsigned i = 0, e = Packet.size(); i != e; ++i) { 169 if (Packet[i]->Succs.size() == 0) 170 continue; 171 172 // Enable .cur formation. 173 if (QII.mayBeCurLoad(*Packet[i]->getInstr())) 174 continue; 175 176 for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(), 177 E = Packet[i]->Succs.end(); I != E; ++I) { 178 // Since we do not add pseudos to packets, might as well 179 // ignore order dependencies. 180 if (I->isCtrl()) 181 continue; 182 183 if (I->getSUnit() == SU) 184 return false; 185 } 186 } 187 return true; 188 } 189 190 /// Keep track of available resources. 191 bool VLIWResourceModel::reserveResources(SUnit *SU) { 192 bool startNewCycle = false; 193 // Artificially reset state. 194 if (!SU) { 195 ResourcesModel->clearResources(); 196 savePacket(); 197 Packet.clear(); 198 TotalPackets++; 199 return false; 200 } 201 // If this SU does not fit in the packet 202 // start a new one. 203 if (!isResourceAvailable(SU)) { 204 ResourcesModel->clearResources(); 205 savePacket(); 206 Packet.clear(); 207 TotalPackets++; 208 startNewCycle = true; 209 } 210 211 switch (SU->getInstr()->getOpcode()) { 212 default: 213 ResourcesModel->reserveResources(*SU->getInstr()); 214 break; 215 case TargetOpcode::EXTRACT_SUBREG: 216 case TargetOpcode::INSERT_SUBREG: 217 case TargetOpcode::SUBREG_TO_REG: 218 case TargetOpcode::REG_SEQUENCE: 219 case TargetOpcode::IMPLICIT_DEF: 220 case TargetOpcode::KILL: 221 case TargetOpcode::CFI_INSTRUCTION: 222 case TargetOpcode::EH_LABEL: 223 case TargetOpcode::COPY: 224 case TargetOpcode::INLINEASM: 225 break; 226 } 227 Packet.push_back(SU); 228 229 #ifndef NDEBUG 230 DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n"); 231 for (unsigned i = 0, e = Packet.size(); i != e; ++i) { 232 DEBUG(dbgs() << "\t[" << i << "] SU("); 233 DEBUG(dbgs() << Packet[i]->NodeNum << ")\t"); 234 DEBUG(Packet[i]->getInstr()->dump()); 235 } 236 #endif 237 238 // If packet is now full, reset the state so in the next cycle 239 // we start fresh. 240 if (Packet.size() >= SchedModel->getIssueWidth()) { 241 ResourcesModel->clearResources(); 242 savePacket(); 243 Packet.clear(); 244 TotalPackets++; 245 startNewCycle = true; 246 } 247 248 return startNewCycle; 249 } 250 251 /// schedule - Called back from MachineScheduler::runOnMachineFunction 252 /// after setting up the current scheduling region. [RegionBegin, RegionEnd) 253 /// only includes instructions that have DAG nodes, not scheduling boundaries. 254 void VLIWMachineScheduler::schedule() { 255 DEBUG(dbgs() 256 << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber() 257 << " " << BB->getName() 258 << " in_func " << BB->getParent()->getFunction()->getName() 259 << " at loop depth " << MLI->getLoopDepth(BB) 260 << " \n"); 261 262 buildDAGWithRegPressure(); 263 264 SmallVector<SUnit*, 8> TopRoots, BotRoots; 265 findRootsAndBiasEdges(TopRoots, BotRoots); 266 267 // Initialize the strategy before modifying the DAG. 268 SchedImpl->initialize(this); 269 270 DEBUG(unsigned maxH = 0; 271 for (unsigned su = 0, e = SUnits.size(); su != e; ++su) 272 if (SUnits[su].getHeight() > maxH) 273 maxH = SUnits[su].getHeight(); 274 dbgs() << "Max Height " << maxH << "\n";); 275 DEBUG(unsigned maxD = 0; 276 for (unsigned su = 0, e = SUnits.size(); su != e; ++su) 277 if (SUnits[su].getDepth() > maxD) 278 maxD = SUnits[su].getDepth(); 279 dbgs() << "Max Depth " << maxD << "\n";); 280 DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) 281 SUnits[su].dumpAll(this)); 282 283 initQueues(TopRoots, BotRoots); 284 285 bool IsTopNode = false; 286 while (true) { 287 DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); 288 SUnit *SU = SchedImpl->pickNode(IsTopNode); 289 if (!SU) break; 290 291 if (!checkSchedLimit()) 292 break; 293 294 scheduleMI(SU, IsTopNode); 295 296 updateQueues(SU, IsTopNode); 297 298 // Notify the scheduling strategy after updating the DAG. 299 SchedImpl->schedNode(SU, IsTopNode); 300 } 301 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); 302 303 placeDebugValues(); 304 305 DEBUG({ 306 unsigned BBNum = begin()->getParent()->getNumber(); 307 dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; 308 dumpSchedule(); 309 dbgs() << '\n'; 310 }); 311 } 312 313 void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { 314 DAG = static_cast<VLIWMachineScheduler*>(dag); 315 SchedModel = DAG->getSchedModel(); 316 317 Top.init(DAG, SchedModel); 318 Bot.init(DAG, SchedModel); 319 320 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or 321 // are disabled, then these HazardRecs will be disabled. 322 const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries(); 323 const TargetSubtargetInfo &STI = DAG->MF.getSubtarget(); 324 const TargetInstrInfo *TII = STI.getInstrInfo(); 325 delete Top.HazardRec; 326 delete Bot.HazardRec; 327 Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); 328 Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); 329 330 delete Top.ResourceModel; 331 delete Bot.ResourceModel; 332 Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel()); 333 Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel()); 334 335 assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) && 336 "-misched-topdown incompatible with -misched-bottomup"); 337 338 DAG->addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>()); 339 DAG->addMutation(make_unique<HexagonCallMutation>()); 340 } 341 342 void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { 343 if (SU->isScheduled) 344 return; 345 346 for (const SDep &PI : SU->Preds) { 347 unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle; 348 unsigned MinLatency = PI.getLatency(); 349 #ifndef NDEBUG 350 Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); 351 #endif 352 if (SU->TopReadyCycle < PredReadyCycle + MinLatency) 353 SU->TopReadyCycle = PredReadyCycle + MinLatency; 354 } 355 Top.releaseNode(SU, SU->TopReadyCycle); 356 } 357 358 void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { 359 if (SU->isScheduled) 360 return; 361 362 assert(SU->getInstr() && "Scheduled SUnit must have instr"); 363 364 for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 365 I != E; ++I) { 366 unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; 367 unsigned MinLatency = I->getLatency(); 368 #ifndef NDEBUG 369 Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); 370 #endif 371 if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) 372 SU->BotReadyCycle = SuccReadyCycle + MinLatency; 373 } 374 Bot.releaseNode(SU, SU->BotReadyCycle); 375 } 376 377 /// Does this SU have a hazard within the current instruction group. 378 /// 379 /// The scheduler supports two modes of hazard recognition. The first is the 380 /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that 381 /// supports highly complicated in-order reservation tables 382 /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. 383 /// 384 /// The second is a streamlined mechanism that checks for hazards based on 385 /// simple counters that the scheduler itself maintains. It explicitly checks 386 /// for instruction dispatch limitations, including the number of micro-ops that 387 /// can dispatch per cycle. 388 /// 389 /// TODO: Also check whether the SU must start a new group. 390 bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) { 391 if (HazardRec->isEnabled()) 392 return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; 393 394 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); 395 if (IssueCount + uops > SchedModel->getIssueWidth()) 396 return true; 397 398 return false; 399 } 400 401 void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU, 402 unsigned ReadyCycle) { 403 if (ReadyCycle < MinReadyCycle) 404 MinReadyCycle = ReadyCycle; 405 406 // Check for interlocks first. For the purpose of other heuristics, an 407 // instruction that cannot issue appears as if it's not in the ReadyQueue. 408 if (ReadyCycle > CurrCycle || checkHazard(SU)) 409 410 Pending.push(SU); 411 else 412 Available.push(SU); 413 } 414 415 /// Move the boundary of scheduled code by one cycle. 416 void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() { 417 unsigned Width = SchedModel->getIssueWidth(); 418 IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; 419 420 assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); 421 unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); 422 423 if (!HazardRec->isEnabled()) { 424 // Bypass HazardRec virtual calls. 425 CurrCycle = NextCycle; 426 } else { 427 // Bypass getHazardType calls in case of long latency. 428 for (; CurrCycle != NextCycle; ++CurrCycle) { 429 if (isTop()) 430 HazardRec->AdvanceCycle(); 431 else 432 HazardRec->RecedeCycle(); 433 } 434 } 435 CheckPending = true; 436 437 DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle " 438 << CurrCycle << '\n'); 439 } 440 441 /// Move the boundary of scheduled code by one SUnit. 442 void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) { 443 bool startNewCycle = false; 444 445 // Update the reservation table. 446 if (HazardRec->isEnabled()) { 447 if (!isTop() && SU->isCall) { 448 // Calls are scheduled with their preceding instructions. For bottom-up 449 // scheduling, clear the pipeline state before emitting. 450 HazardRec->Reset(); 451 } 452 HazardRec->EmitInstruction(SU); 453 } 454 455 // Update DFA model. 456 startNewCycle = ResourceModel->reserveResources(SU); 457 458 // Check the instruction group dispatch limit. 459 // TODO: Check if this SU must end a dispatch group. 460 IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); 461 if (startNewCycle) { 462 DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); 463 bumpCycle(); 464 } 465 else 466 DEBUG(dbgs() << "*** IssueCount " << IssueCount 467 << " at cycle " << CurrCycle << '\n'); 468 } 469 470 /// Release pending ready nodes in to the available queue. This makes them 471 /// visible to heuristics. 472 void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() { 473 // If the available queue is empty, it is safe to reset MinReadyCycle. 474 if (Available.empty()) 475 MinReadyCycle = UINT_MAX; 476 477 // Check to see if any of the pending instructions are ready to issue. If 478 // so, add them to the available queue. 479 for (unsigned i = 0, e = Pending.size(); i != e; ++i) { 480 SUnit *SU = *(Pending.begin()+i); 481 unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; 482 483 if (ReadyCycle < MinReadyCycle) 484 MinReadyCycle = ReadyCycle; 485 486 if (ReadyCycle > CurrCycle) 487 continue; 488 489 if (checkHazard(SU)) 490 continue; 491 492 Available.push(SU); 493 Pending.remove(Pending.begin()+i); 494 --i; --e; 495 } 496 CheckPending = false; 497 } 498 499 /// Remove SU from the ready set for this boundary. 500 void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) { 501 if (Available.isInQueue(SU)) 502 Available.remove(Available.find(SU)); 503 else { 504 assert(Pending.isInQueue(SU) && "bad ready count"); 505 Pending.remove(Pending.find(SU)); 506 } 507 } 508 509 /// If this queue only has one ready candidate, return it. As a side effect, 510 /// advance the cycle until at least one node is ready. If multiple instructions 511 /// are ready, return NULL. 512 SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() { 513 if (CheckPending) 514 releasePending(); 515 516 for (unsigned i = 0; Available.empty(); ++i) { 517 assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && 518 "permanent hazard"); (void)i; 519 ResourceModel->reserveResources(nullptr); 520 bumpCycle(); 521 releasePending(); 522 } 523 if (Available.size() == 1) 524 return *Available.begin(); 525 return nullptr; 526 } 527 528 #ifndef NDEBUG 529 void ConvergingVLIWScheduler::traceCandidate(const char *Label, 530 const ReadyQueue &Q, SUnit *SU, int Cost, PressureChange P) { 531 dbgs() << Label << " " << Q.getName() << " "; 532 if (P.isValid()) 533 dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" 534 << P.getUnitInc() << " "; 535 else 536 dbgs() << " "; 537 dbgs() << "cost(" << Cost << ")\t"; 538 SU->dump(DAG); 539 } 540 541 // Very detailed queue dump, to be used with higher verbosity levels. 542 void ConvergingVLIWScheduler::readyQueueVerboseDump( 543 const RegPressureTracker &RPTracker, SchedCandidate &Candidate, 544 ReadyQueue &Q) { 545 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); 546 547 dbgs() << ">>> " << Q.getName() << "\n"; 548 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { 549 RegPressureDelta RPDelta; 550 TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, 551 DAG->getRegionCriticalPSets(), 552 DAG->getRegPressure().MaxSetPressure); 553 std::stringstream dbgstr; 554 dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")"; 555 dbgs() << dbgstr.str(); 556 SchedulingCost(Q, *I, Candidate, RPDelta, true); 557 dbgs() << "\t"; 558 (*I)->getInstr()->dump(); 559 } 560 dbgs() << "\n"; 561 } 562 #endif 563 564 /// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor 565 /// of SU, return it, otherwise return null. 566 static SUnit *getSingleUnscheduledPred(SUnit *SU) { 567 SUnit *OnlyAvailablePred = nullptr; 568 for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); 569 I != E; ++I) { 570 SUnit &Pred = *I->getSUnit(); 571 if (!Pred.isScheduled) { 572 // We found an available, but not scheduled, predecessor. If it's the 573 // only one we have found, keep track of it... otherwise give up. 574 if (OnlyAvailablePred && OnlyAvailablePred != &Pred) 575 return nullptr; 576 OnlyAvailablePred = &Pred; 577 } 578 } 579 return OnlyAvailablePred; 580 } 581 582 /// getSingleUnscheduledSucc - If there is exactly one unscheduled successor 583 /// of SU, return it, otherwise return null. 584 static SUnit *getSingleUnscheduledSucc(SUnit *SU) { 585 SUnit *OnlyAvailableSucc = nullptr; 586 for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); 587 I != E; ++I) { 588 SUnit &Succ = *I->getSUnit(); 589 if (!Succ.isScheduled) { 590 // We found an available, but not scheduled, successor. If it's the 591 // only one we have found, keep track of it... otherwise give up. 592 if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ) 593 return nullptr; 594 OnlyAvailableSucc = &Succ; 595 } 596 } 597 return OnlyAvailableSucc; 598 } 599 600 // Constants used to denote relative importance of 601 // heuristic components for cost computation. 602 static const unsigned PriorityOne = 200; 603 static const unsigned PriorityTwo = 50; 604 static const unsigned PriorityThree = 75; 605 static const unsigned ScaleTwo = 10; 606 static const unsigned FactorOne = 2; 607 608 /// Single point to compute overall scheduling cost. 609 /// TODO: More heuristics will be used soon. 610 int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, 611 SchedCandidate &Candidate, 612 RegPressureDelta &Delta, 613 bool verbose) { 614 // Initial trivial priority. 615 int ResCount = 1; 616 617 // Do not waste time on a node that is already scheduled. 618 if (!SU || SU->isScheduled) 619 return ResCount; 620 621 MachineInstr &Instr = *SU->getInstr(); 622 623 DEBUG(if (verbose) dbgs() << ((Q.getID() == TopQID) ? "(top|" : "(bot|")); 624 // Forced priority is high. 625 if (SU->isScheduleHigh) { 626 ResCount += PriorityOne; 627 DEBUG(dbgs() << "H|"); 628 } 629 630 // Critical path first. 631 if (Q.getID() == TopQID) { 632 ResCount += (SU->getHeight() * ScaleTwo); 633 634 DEBUG(if (verbose) { 635 std::stringstream dbgstr; 636 dbgstr << "h" << std::setw(3) << SU->getHeight() << "|"; 637 dbgs() << dbgstr.str(); 638 }); 639 640 // If resources are available for it, multiply the 641 // chance of scheduling. 642 if (Top.ResourceModel->isResourceAvailable(SU)) { 643 ResCount <<= FactorOne; 644 ResCount += PriorityThree; 645 DEBUG(if (verbose) dbgs() << "A|"); 646 } else 647 DEBUG(if (verbose) dbgs() << " |"); 648 } else { 649 ResCount += (SU->getDepth() * ScaleTwo); 650 651 DEBUG(if (verbose) { 652 std::stringstream dbgstr; 653 dbgstr << "d" << std::setw(3) << SU->getDepth() << "|"; 654 dbgs() << dbgstr.str(); 655 }); 656 657 // If resources are available for it, multiply the 658 // chance of scheduling. 659 if (Bot.ResourceModel->isResourceAvailable(SU)) { 660 ResCount <<= FactorOne; 661 ResCount += PriorityThree; 662 DEBUG(if (verbose) dbgs() << "A|"); 663 } else 664 DEBUG(if (verbose) dbgs() << " |"); 665 } 666 667 unsigned NumNodesBlocking = 0; 668 if (Q.getID() == TopQID) { 669 // How many SUs does it block from scheduling? 670 // Look at all of the successors of this node. 671 // Count the number of nodes that 672 // this node is the sole unscheduled node for. 673 for (const SDep &SI : SU->Succs) 674 if (getSingleUnscheduledPred(SI.getSUnit()) == SU) 675 ++NumNodesBlocking; 676 } else { 677 // How many unscheduled predecessors block this node? 678 for (const SDep &PI : SU->Preds) 679 if (getSingleUnscheduledSucc(PI.getSUnit()) == SU) 680 ++NumNodesBlocking; 681 } 682 ResCount += (NumNodesBlocking * ScaleTwo); 683 684 DEBUG(if (verbose) { 685 std::stringstream dbgstr; 686 dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|"; 687 dbgs() << dbgstr.str(); 688 }); 689 690 // Factor in reg pressure as a heuristic. 691 if (!IgnoreBBRegPressure) { 692 // Decrease priority by the amount that register pressure exceeds the limit. 693 ResCount -= (Delta.Excess.getUnitInc()*PriorityOne); 694 // Decrease priority if register pressure exceeds the limit. 695 ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityOne); 696 // Decrease priority slightly if register pressure would increase over the 697 // current maximum. 698 ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo); 699 DEBUG(if (verbose) { 700 dbgs() << "RP " << Delta.Excess.getUnitInc() << "/" 701 << Delta.CriticalMax.getUnitInc() <<"/" 702 << Delta.CurrentMax.getUnitInc() << ")|"; 703 }); 704 } 705 706 // Give a little extra priority to a .cur instruction if there is a resource 707 // available for it. 708 auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>(); 709 auto &QII = *QST.getInstrInfo(); 710 if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) { 711 if (Q.getID() == TopQID && Top.ResourceModel->isResourceAvailable(SU)) { 712 ResCount += PriorityTwo; 713 DEBUG(if (verbose) dbgs() << "C|"); 714 } else if (Q.getID() == BotQID && 715 Bot.ResourceModel->isResourceAvailable(SU)) { 716 ResCount += PriorityTwo; 717 DEBUG(if (verbose) dbgs() << "C|"); 718 } 719 } 720 721 // Give preference to a zero latency instruction if the dependent 722 // instruction is in the current packet. 723 if (Q.getID() == TopQID) { 724 for (const SDep &PI : SU->Preds) { 725 if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() && 726 PI.getLatency() == 0 && 727 Top.ResourceModel->isInPacket(PI.getSUnit())) { 728 ResCount += PriorityThree; 729 DEBUG(if (verbose) dbgs() << "Z|"); 730 } 731 } 732 } else { 733 for (const SDep &SI : SU->Succs) { 734 if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() && 735 SI.getLatency() == 0 && 736 Bot.ResourceModel->isInPacket(SI.getSUnit())) { 737 ResCount += PriorityThree; 738 DEBUG(if (verbose) dbgs() << "Z|"); 739 } 740 } 741 } 742 743 // Give less preference to an instruction that will cause a stall with 744 // an instruction in the previous packet. 745 if (QII.isV60VectorInstruction(Instr)) { 746 // Check for stalls in the previous packet. 747 if (Q.getID() == TopQID) { 748 for (auto J : Top.ResourceModel->OldPacket) 749 if (QII.producesStall(*J->getInstr(), Instr)) 750 ResCount -= PriorityOne; 751 } else { 752 for (auto J : Bot.ResourceModel->OldPacket) 753 if (QII.producesStall(Instr, *J->getInstr())) 754 ResCount -= PriorityOne; 755 } 756 } 757 758 // If the instruction has a non-zero latency dependence with an instruction in 759 // the current packet, then it should not be scheduled yet. The case occurs 760 // when the dependent instruction is scheduled in a new packet, so the 761 // scheduler updates the current cycle and pending instructions become 762 // available. 763 if (CheckEarlyAvail) { 764 if (Q.getID() == TopQID) { 765 for (const auto &PI : SU->Preds) { 766 if (PI.getLatency() > 0 && 767 Top.ResourceModel->isInPacket(PI.getSUnit())) { 768 ResCount -= PriorityOne; 769 DEBUG(if (verbose) dbgs() << "D|"); 770 } 771 } 772 } else { 773 for (const auto &SI : SU->Succs) { 774 if (SI.getLatency() > 0 && 775 Bot.ResourceModel->isInPacket(SI.getSUnit())) { 776 ResCount -= PriorityOne; 777 DEBUG(if (verbose) dbgs() << "D|"); 778 } 779 } 780 } 781 } 782 783 DEBUG(if (verbose) { 784 std::stringstream dbgstr; 785 dbgstr << "Total " << std::setw(4) << ResCount << ")"; 786 dbgs() << dbgstr.str(); 787 }); 788 789 return ResCount; 790 } 791 792 /// Pick the best candidate from the top queue. 793 /// 794 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during 795 /// DAG building. To adjust for the current scheduling location we need to 796 /// maintain the number of vreg uses remaining to be top-scheduled. 797 ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler:: 798 pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker, 799 SchedCandidate &Candidate) { 800 DEBUG(if (SchedDebugVerboseLevel > 1) 801 readyQueueVerboseDump(RPTracker, Candidate, Q); 802 else Q.dump();); 803 804 // getMaxPressureDelta temporarily modifies the tracker. 805 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); 806 807 // BestSU remains NULL if no top candidates beat the best existing candidate. 808 CandResult FoundCandidate = NoCand; 809 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { 810 RegPressureDelta RPDelta; 811 TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, 812 DAG->getRegionCriticalPSets(), 813 DAG->getRegPressure().MaxSetPressure); 814 815 int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false); 816 817 // Initialize the candidate if needed. 818 if (!Candidate.SU) { 819 DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost)); 820 Candidate.SU = *I; 821 Candidate.RPDelta = RPDelta; 822 Candidate.SCost = CurrentCost; 823 FoundCandidate = NodeOrder; 824 continue; 825 } 826 827 // Best cost. 828 if (CurrentCost > Candidate.SCost) { 829 DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost)); 830 Candidate.SU = *I; 831 Candidate.RPDelta = RPDelta; 832 Candidate.SCost = CurrentCost; 833 FoundCandidate = BestCost; 834 continue; 835 } 836 837 // Tie breaker using Timing Class. 838 if (!DisableTCTie) { 839 auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>(); 840 auto &QII = *QST.getInstrInfo(); 841 842 const MachineInstr *MI = (*I)->getInstr(); 843 const MachineInstr *CandI = Candidate.SU->getInstr(); 844 const InstrItineraryData *InstrItins = QST.getInstrItineraryData(); 845 846 unsigned InstrLatency = QII.getInstrTimingClassLatency(InstrItins, *MI); 847 unsigned CandLatency = QII.getInstrTimingClassLatency(InstrItins, *CandI); 848 DEBUG(dbgs() << "TC Tie Breaker Cand: " 849 << CandLatency << " Instr:" << InstrLatency << "\n" 850 << *MI << *CandI << "\n"); 851 if (Q.getID() == TopQID && CurrentCost == Candidate.SCost) { 852 if (InstrLatency < CandLatency && TopUseShorterTie) { 853 Candidate.SU = *I; 854 Candidate.RPDelta = RPDelta; 855 Candidate.SCost = CurrentCost; 856 FoundCandidate = BestCost; 857 DEBUG(dbgs() << "Used top shorter tie breaker\n"); 858 continue; 859 } else if (InstrLatency > CandLatency && !TopUseShorterTie) { 860 Candidate.SU = *I; 861 Candidate.RPDelta = RPDelta; 862 Candidate.SCost = CurrentCost; 863 FoundCandidate = BestCost; 864 DEBUG(dbgs() << "Used top longer tie breaker\n"); 865 continue; 866 } 867 } else if (Q.getID() == BotQID && CurrentCost == Candidate.SCost) { 868 if (InstrLatency < CandLatency && BotUseShorterTie) { 869 Candidate.SU = *I; 870 Candidate.RPDelta = RPDelta; 871 Candidate.SCost = CurrentCost; 872 FoundCandidate = BestCost; 873 DEBUG(dbgs() << "Used Bot shorter tie breaker\n"); 874 continue; 875 } else if (InstrLatency > CandLatency && !BotUseShorterTie) { 876 Candidate.SU = *I; 877 Candidate.RPDelta = RPDelta; 878 Candidate.SCost = CurrentCost; 879 FoundCandidate = BestCost; 880 DEBUG(dbgs() << "Used Bot longer tie breaker\n"); 881 continue; 882 } 883 } 884 } 885 886 if (CurrentCost == Candidate.SCost) { 887 if ((Q.getID() == TopQID && 888 (*I)->Succs.size() > Candidate.SU->Succs.size()) || 889 (Q.getID() == BotQID && 890 (*I)->Preds.size() < Candidate.SU->Preds.size())) { 891 DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost)); 892 Candidate.SU = *I; 893 Candidate.RPDelta = RPDelta; 894 Candidate.SCost = CurrentCost; 895 FoundCandidate = BestCost; 896 continue; 897 } 898 } 899 900 // Fall through to original instruction order. 901 // Only consider node order if Candidate was chosen from this Q. 902 if (FoundCandidate == NoCand) 903 continue; 904 } 905 return FoundCandidate; 906 } 907 908 /// Pick the best candidate node from either the top or bottom queue. 909 SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) { 910 // Schedule as far as possible in the direction of no choice. This is most 911 // efficient, but also provides the best heuristics for CriticalPSets. 912 if (SUnit *SU = Bot.pickOnlyChoice()) { 913 DEBUG(dbgs() << "Picked only Bottom\n"); 914 IsTopNode = false; 915 return SU; 916 } 917 if (SUnit *SU = Top.pickOnlyChoice()) { 918 DEBUG(dbgs() << "Picked only Top\n"); 919 IsTopNode = true; 920 return SU; 921 } 922 SchedCandidate BotCand; 923 // Prefer bottom scheduling when heuristics are silent. 924 CandResult BotResult = pickNodeFromQueue(Bot.Available, 925 DAG->getBotRPTracker(), BotCand); 926 assert(BotResult != NoCand && "failed to find the first candidate"); 927 928 // If either Q has a single candidate that provides the least increase in 929 // Excess pressure, we can immediately schedule from that Q. 930 // 931 // RegionCriticalPSets summarizes the pressure within the scheduled region and 932 // affects picking from either Q. If scheduling in one direction must 933 // increase pressure for one of the excess PSets, then schedule in that 934 // direction first to provide more freedom in the other direction. 935 if (BotResult == SingleExcess || BotResult == SingleCritical) { 936 DEBUG(dbgs() << "Prefered Bottom Node\n"); 937 IsTopNode = false; 938 return BotCand.SU; 939 } 940 // Check if the top Q has a better candidate. 941 SchedCandidate TopCand; 942 CandResult TopResult = pickNodeFromQueue(Top.Available, 943 DAG->getTopRPTracker(), TopCand); 944 assert(TopResult != NoCand && "failed to find the first candidate"); 945 946 if (TopResult == SingleExcess || TopResult == SingleCritical) { 947 DEBUG(dbgs() << "Prefered Top Node\n"); 948 IsTopNode = true; 949 return TopCand.SU; 950 } 951 // If either Q has a single candidate that minimizes pressure above the 952 // original region's pressure pick it. 953 if (BotResult == SingleMax) { 954 DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n"); 955 IsTopNode = false; 956 return BotCand.SU; 957 } 958 if (TopResult == SingleMax) { 959 DEBUG(dbgs() << "Prefered Top Node SingleMax\n"); 960 IsTopNode = true; 961 return TopCand.SU; 962 } 963 if (TopCand.SCost > BotCand.SCost) { 964 DEBUG(dbgs() << "Prefered Top Node Cost\n"); 965 IsTopNode = true; 966 return TopCand.SU; 967 } 968 // Otherwise prefer the bottom candidate in node order. 969 DEBUG(dbgs() << "Prefered Bottom in Node order\n"); 970 IsTopNode = false; 971 return BotCand.SU; 972 } 973 974 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 975 SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { 976 if (DAG->top() == DAG->bottom()) { 977 assert(Top.Available.empty() && Top.Pending.empty() && 978 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); 979 return nullptr; 980 } 981 SUnit *SU; 982 if (llvm::ForceTopDown) { 983 SU = Top.pickOnlyChoice(); 984 if (!SU) { 985 SchedCandidate TopCand; 986 CandResult TopResult = 987 pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand); 988 assert(TopResult != NoCand && "failed to find the first candidate"); 989 (void)TopResult; 990 SU = TopCand.SU; 991 } 992 IsTopNode = true; 993 } else if (llvm::ForceBottomUp) { 994 SU = Bot.pickOnlyChoice(); 995 if (!SU) { 996 SchedCandidate BotCand; 997 CandResult BotResult = 998 pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand); 999 assert(BotResult != NoCand && "failed to find the first candidate"); 1000 (void)BotResult; 1001 SU = BotCand.SU; 1002 } 1003 IsTopNode = false; 1004 } else { 1005 SU = pickNodeBidrectional(IsTopNode); 1006 } 1007 if (SU->isTopReady()) 1008 Top.removeReady(SU); 1009 if (SU->isBottomReady()) 1010 Bot.removeReady(SU); 1011 1012 DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") 1013 << " Scheduling Instruction in cycle " 1014 << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n'; 1015 SU->dump(DAG)); 1016 return SU; 1017 } 1018 1019 /// Update the scheduler's state after scheduling a node. This is the same node 1020 /// that was just returned by pickNode(). However, VLIWMachineScheduler needs 1021 /// to update it's state based on the current cycle before MachineSchedStrategy 1022 /// does. 1023 void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { 1024 if (IsTopNode) { 1025 SU->TopReadyCycle = Top.CurrCycle; 1026 Top.bumpNode(SU); 1027 } else { 1028 SU->BotReadyCycle = Bot.CurrCycle; 1029 Bot.bumpNode(SU); 1030 } 1031 } 1032