//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
      HasExcessPressure(false), MF(nullptr) {}

void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  // FIXME: This is necessary because some passes that run after scheduling
  // and before regalloc increase register pressure.
  const unsigned ErrorMargin = 3;

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  TargetOccupancy = MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
  VGPRCriticalLimit =
      std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);

  // Subtract the error margin from the register limits, guarding against
  // unsigned underflow.
  SGPRCriticalLimit =
      std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit =
      std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
}

void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom-up scheduling, the register pressure is
    // cached and can be retrieved by DAG->getPressureDiff(SU).
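    // A rough sketch of what that cached form might look like (an untested
    // assumption, not verified against the PressureDiff semantics):
    //   const PressureDiff &PDiff = DAG->getPressureDiff(SU);
    //   for (const PressureChange &PC : PDiff)
    //     Pressure[PC.getPSet()] += PC.getUnitInc();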
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the fewest registers, which in
  // our case would be SGPRs. This is rarely what we want, so when we report
  // excess/critical register pressure, we do it either only for VGPRs or
  // only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasExcessPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasExcessPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR pressure and increasing
  // VGPR pressure carry the same cost, so we don't need to prefer one over
  // the other.
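  //
  // For illustration (hypothetical numbers, not taken from any particular
  // subtarget): with SGPRCriticalLimit = 77 and VGPRCriticalLimit = 21, a
  // candidate reaching NewSGPRPressure = 80 and NewVGPRPressure = 20 gets
  // SGPRDelta = 3 and VGPRDelta = -1 below, so CriticalMax is reported
  // against the SReg_32 set with a unit increment of 3.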

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasExcessPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {
    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query
      // it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone
  // and the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
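  // BotCand and TopCand are cached across pickNodeBidirectional() calls, so a
  // previous pick can be reused as long as its zone policy still matches and
  // its node was not scheduled in the meantime; with scheduler verification
  // enabled (VerifyScheduling), the cached pick is re-validated against a
  // fresh pick below.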
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
    for (SDep &Dep : SU->Preds) {
      if (Dep.isCluster()) {
        HasClusteredNodes = true;
        break;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}
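
// GCNScheduleDAGMILive drives scheduling in several stages (see the Stage
// values used in finalizeSchedule() below): a Collect pass that only records
// regions, the initial schedule, an unclustered reschedule of regions that
// exceeded their register limits, a clustered reschedule at the lowest
// recorded occupancy, and finally a pre-RA rematerialization stage.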
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
      Stage(Collect), RegionIdx(0) {
  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}

void GCNScheduleDAGMILive::schedule() {
  if (Stage == Collect) {
    // Just record regions at the first pass.
    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
    return;
  }

  std::vector<MachineInstr*> Unsched;
  Unsched.reserve(NumRegionInstrs);
  for (auto &I : *this) {
    Unsched.push_back(&I);
  }

  GCNRegPressure PressureBefore;
  if (LIS) {
    PressureBefore = Pressure[RegionIdx];

    LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
               GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
               dbgs() << "Region live-in pressure: ";
               llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
               dbgs() << "Region register pressure: ";
               PressureBefore.print(dbgs()));
  }

  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
  // Set HasClusteredNodes to true for late stages where we have already
  // collected it. That way pickNode() will not scan SDeps when it is not
  // needed.
  S.HasClusteredNodes = Stage > InitialSchedule;
  S.HasExcessPressure = false;
  ScheduleDAGMILive::schedule();
  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
  RescheduleRegions[RegionIdx] = false;
  if (Stage == InitialSchedule && S.HasClusteredNodes)
    RegionsWithClusters[RegionIdx] = true;
  if (S.HasExcessPressure)
    RegionsWithHighRP[RegionIdx] = true;

  if (!LIS)
    return;

  // Check the results of scheduling.
  auto PressureAfter = getRealRegPressure();

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
             PressureAfter.print(dbgs()));

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    Pressure[RegionIdx] = PressureAfter;
    RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == MinOccupancy;

    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned WavesAfter =
      std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // region just scheduled. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
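  //
  // For example (hypothetical numbers): with MinOccupancy = 8, WavesBefore = 8
  // and WavesAfter = 6, NewOccupancy starts at 8; if the function is memory
  // bound and MFI.getMinAllowedOccupancy() is 4, the check below lowers
  // NewOccupancy to 6 rather than keeping 8 and reverting the schedule purely
  // for occupancy's sake.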
  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < MinOccupancy) {
    MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(MinOccupancy);
    RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << MinOccupancy << ".\n");
  }

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
      PressureAfter.getAGPRNum() > MaxVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    RescheduleRegions[RegionIdx] = true;
    RegionsWithHighRP[RegionIdx] = true;
  }

  // If this condition is true, then either the occupancy before and after
  // scheduling is the same, or we are allowing the occupancy to drop because
  // the function is memory bound. Even if we are OK with the current
  // occupancy, we still need to verify that we will not introduce any extra
  // chance of spilling.
  if (WavesAfter >= MinOccupancy) {
    if (Stage == UnclusteredReschedule &&
        !PressureAfter.less(ST, PressureBefore)) {
      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
               PressureAfter.less(ST, PressureBefore) ||
               !RescheduleRegions[RegionIdx]) {
      Pressure[RegionIdx] = PressureAfter;
      RegionsWithMinOcc[RegionIdx] =
          PressureAfter.getOccupancy(ST) == MinOccupancy;
      if (!RegionsWithClusters[RegionIdx] &&
          (Stage + 1) == UnclusteredReschedule)
        RescheduleRegions[RegionIdx] = false;
      return;
    } else {
      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    }
  }

  RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
                                 (Stage + 1) != UnclusteredReschedule;
  RegionEnd = RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != RegionEnd) {
      BB->remove(MI);
      BB->insert(RegionEnd, MI);
      if (!MI->isDebugInstr())
        LIS->handleMove(*MI, true);
    }
    // Reset read-undef flags and update them later.
    for (auto &Op : MI->operands())
      if (Op.isReg() && Op.isDef())
        Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *LIS);
      }
    }
    RegionEnd = MI->getIterator();
    ++RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past the debug instrs to the actual end of the scheduling
  // region.
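  //
  // For instance (hypothetical region contents): if Unsched was
  // [DBG1, I1, DBG2, I2], the loop above re-inserted only I1 and I2 and
  // counted SkippedDebugInstr == 2, so RegionEnd advances twice to move past
  // the two debug instructions now sitting at the end of the block.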
  while (SkippedDebugInstr-- > 0)
    ++RegionEnd;

  // If Unsched.front() instruction is a debug instruction, this will actually
  // shrink the region since we moved all debug instructions to the end of the
  // block. Find the first instruction that is not a debug instruction.
  RegionBegin = Unsched.front()->getIterator();
  if (RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  placeDebugValues();

  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
}

GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent to scheduling after the current block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
    SlotIndexes *Ind = LIS->getSlotIndexes();
    if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
      OnlySucc = *MBB->succ_begin();
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
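    // MI is the first non-debug instruction of this region; use it as the key
    // that getLiveRegMap() computes a live-in set for.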
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithClusters.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithClusters.reset();
  RegionsWithHighRP.reset();
  RegionsWithMinOcc.reset();

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

  do {
    Stage++;
    RegionIdx = 0;
    MachineBasicBlock *MBB = nullptr;

    if (Stage > InitialSchedule) {
      if (!LIS)
        break;

      // Retry function scheduling if the resulting occupancy is lower than
      // the occupancy used for the first scheduling pass. This gives more
      // freedom to schedule low register pressure blocks.
      // Code is partially copied from MachineSchedulerBase::scheduleRegions().

      if (Stage == UnclusteredReschedule) {
        if (RescheduleRegions.none())
          continue;
        LLVM_DEBUG(dbgs()
                   << "Retrying function scheduling without clustering.\n");
      }

      if (Stage == ClusteredLowOccupancyReschedule) {
        if (StartingOccupancy <= MinOccupancy)
          break;

        LLVM_DEBUG(
            dbgs()
            << "Retrying function scheduling with lowest recorded occupancy "
            << MinOccupancy << ".\n");
      }

      if (Stage == PreRARematerialize) {
        if (RegionsWithMinOcc.count() != 1 || Regions.size() == 1)
          break;

        const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
        const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
        // Check whether the function is already at its maximum achievable
        // occupancy; if so, rematerialization cannot improve it.
        if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
            MinOccupancy)
          break;

        // FIXME: This pass will invalidate the cached LiveIns, MBBLiveIns and
        // Pressure for the regions in between the defs and the region we sank
        // the defs into. This will need to be fixed if there is another pass
        // after this one.
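
        // The flow below: collectRematerializableInstructions() gathers
        // single-def, single-use VGPR defs that are live into the high-RP
        // region, and sinkTriviallyRematInsts() re-materializes them next to
        // their single use, rolling everything back if occupancy does not
        // improve.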
        static_assert(LastStage == PreRARematerialize,
                      "Passes after PreRARematerialize are not supported");

        unsigned HighRPIdx = RegionsWithMinOcc.find_first();
        collectRematerializableInstructions(HighRPIdx);
        if (RematerializableInsts.empty() ||
            !sinkTriviallyRematInsts(ST, TII, HighRPIdx))
          break;

        LLVM_DEBUG(
            dbgs() << "Retrying function scheduling with improved occupancy of "
                   << MinOccupancy << " from rematerializing\n");
      }
    }

    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);

    for (auto Region : Regions) {
      if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
           !RescheduleRegions[RegionIdx]) ||
          (Stage == ClusteredLowOccupancyReschedule &&
           !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
        ++RegionIdx;
        continue;
      }

      RegionBegin = Region.first;
      RegionEnd = Region.second;

      if (RegionBegin->getParent() != MBB) {
        if (MBB) finishBlock();
        MBB = RegionBegin->getParent();
        startBlock(MBB);
        if (Stage == InitialSchedule)
          computeBlockPressure(MBB);
      }

      unsigned NumRegionInstrs = std::distance(begin(), end());
      enterRegion(MBB, begin(), end(), NumRegionInstrs);

      // Skip empty scheduling regions (0 or 1 schedulable instructions).
      if (begin() == end() || begin() == std::prev(end())) {
        exitRegion();
        ++RegionIdx;
        continue;
      }

      LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
      LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
                        << MBB->getName() << "\n  From: " << *begin()
                        << "    To: ";
                 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
                 else dbgs() << "End";
                 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

      schedule();

      exitRegion();
      ++RegionIdx;
    }
    finishBlock();

    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);
  } while (Stage != LastStage);
}

void GCNScheduleDAGMILive::collectRematerializableInstructions(
    unsigned HighRPIdx) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  const GCNRPTracker::LiveRegSet &HighRPLiveIns = LiveIns[HighRPIdx];
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization
    if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
        !MRI.hasOneUse(Reg))
      continue;

    // We are only collecting defs that are live-through or defined in another
    // block and used inside this region. This means that the register must be
    // in the live-in set for this region; otherwise, skip this def.
    if (HighRPLiveIns.find(Reg) == HighRPLiveIns.end())
      continue;

    MachineInstr *Def = MRI.getOneDef(Reg)->getParent();
    if (!Def || !isTriviallyReMaterializable(*Def, AA))
      continue;

    MachineInstr *UseI = &*MRI.use_instr_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    RematerializableInsts.push_back(std::make_pair(Def, UseI));
  }
}

bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                                   const TargetInstrInfo *TII,
                                                   unsigned HighRPIdx) {
  RescheduleRegions.reset();
  GCNRPTracker::LiveRegSet NewLiveIns;
  // We may not need to rematerialize all instructions. Keep a list of
  // instructions we are rematerializing at the end.
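  //
  // The early-out below only proceeds when the occupancy drop is attributable
  // to VGPR pressure: if the VGPR count already supports more than
  // MinOccupancy waves, or if the SGPR count is itself a limiter, sinking
  // VGPR defs cannot improve occupancy.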
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4>
      TrivialRematDefsToSink;

  GCNRegPressure RegionPressure = Pressure[HighRPIdx];
  int VGPRUsage = RegionPressure.getVGPRNum(ST.hasGFX90AInsts());
  int SGPRUsage = RegionPressure.getSGPRNum();

  // TODO: Handle occupancy drop due to AGPR and SGPR.
  // Check if the cause of the occupancy drop is VGPR usage.
  if (ST.getOccupancyWithNumVGPRs(VGPRUsage) > MinOccupancy ||
      ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
    return false;

  NewLiveIns.copyFrom(LiveIns[HighRPIdx]);
  // First check if we have enough trivially rematerializable instructions to
  // improve occupancy. Optimistically assume every instruction we are able to
  // sink decreases RP.
  int TotalSinkableRegs = 0;
  for (auto &It : RematerializableInsts) {
    Register DefReg = It.first->getOperand(0).getReg();
    TotalSinkableRegs += SIRegisterInfo::getNumCoveredRegs(NewLiveIns[DefReg]);
  }
  int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
  unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
  // If we cannot improve occupancy even in the most optimistic scenario, then
  // do not attempt to sink any instructions.
  if (OptimisticOccupancy <= MinOccupancy)
    return false;

  // Keep a list of newly rematerialized instructions so that we can easily
  // undo the sinking if occupancy is not improved.
  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  GCNDownwardRPTracker RPT(*LIS);
  auto *NonDbgMI = &*skipDebugInstructionsForward(Regions[HighRPIdx].first,
                                                  Regions[HighRPIdx].second);
  unsigned ImproveOccupancy = 0;
  for (auto &It : RematerializableInsts) {
    MachineInstr *Def = It.first;
    MachineBasicBlock::iterator InsertPos =
        MachineBasicBlock::iterator(It.second);
    Register Reg = Def->getOperand(0).getReg();
    // Rematerialize MI to its use block. Since we are only rematerializing
    // instructions that do not have any virtual reg uses, we do not need to
    // call LiveRangeEdit::allUsesAvailableAt() and
    // LiveRangeEdit::canRematerializeAt().
    NewLiveIns[Reg] = LaneBitmask::getNone();
    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                       Def->getOperand(0).getSubReg(), *Def, *TRI);
    MachineInstr *NewMI = &*(--InsertPos);
    LIS->InsertMachineInstrInMaps(*NewMI);
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
    InsertedMIToOldDef[NewMI] = Def;

    // FIXME: Need a better way to update RP without re-iterating over the
    // region.
    RPT.reset(*NonDbgMI, &NewLiveIns);
    RPT.advance(Regions[HighRPIdx].second);
    GCNRegPressure RPAfterSinking = RPT.moveMaxPressure();
    ImproveOccupancy = RPAfterSinking.getOccupancy(ST);
    if (ImproveOccupancy > MinOccupancy)
      break;
  }

  if (ImproveOccupancy <= MinOccupancy) {
    // Occupancy was not improved. Undo the sinking for the region.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy is improved.
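  // Commit the sink: for each rematerialized def, drop the original
  // instruction, patch the affected scheduling region boundaries, and
  // recompute the cached liveness and pressure for the high-RP region.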
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;
    // Update region boundaries in the scheduling region we sank from, since
    // we may have sunk an instruction that was at the beginning or end of
    // its region.
    updateRegionBoundaries(OldMI, /*NewMI =*/nullptr, /*Removing =*/true);

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);

    // Update region boundaries in the region we sank to.
    MachineBasicBlock::iterator InsertPos =
        std::next(MachineBasicBlock::iterator(MI));
    updateRegionBoundaries(InsertPos, MI);
  }

  // Update cached live-ins and register pressure after rematerializing.
  LiveIns[HighRPIdx].copyFrom(NewLiveIns);
  MBBLiveIns.erase(Regions[HighRPIdx].first->getParent());

  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(Regions[HighRPIdx].first, Regions[HighRPIdx].second,
                    &LiveIns[HighRPIdx]);
  Pressure[HighRPIdx] = RPTracker.moveMaxPressure();

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++MinOccupancy);
  RescheduleRegions[HighRPIdx] = true;

  return true;
}

// Copied from MachineLICM
bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
                                                       AAResults *AA) {
  if (!TII->isTriviallyReMaterializable(MI, AA))
    return false;

  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing a machine instruction, we have to check both the beginning
// and the end of each region. When inserting, we only have to check whether
// we are inserting NewMI in front of a scheduling region; we never need to
// check the end, since we only ever insert before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = Regions.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != Regions[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != Regions[I].first->getParent())
      return;

    if (Removing && MI == Regions[I].first && MI == Regions[I].second) {
      // MI is in a region with size 1. After removing, the region will have
      // size 0, so set RegionBegin and RegionEnd to the past-the-end-of-block
      // iterator.
      Regions[I] =
          std::make_pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == Regions[I].first) {
      if (Removing)
        Regions[I] = std::make_pair(std::next(MI), Regions[I].second);
      else
        // Inserted NewMI in front of region, set new RegionBegin to NewMI.
        Regions[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
                                    Regions[I].second);
      return;
    }
    if (Removing && MI == Regions[I].second) {
      Regions[I] = std::make_pair(Regions[I].first, std::prev(MI));
      return;
    }
  }
}
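
// Illustration of the common cases above (hypothetical region [A, End) over
// instructions A B C):
//   - Removing A                     -> the region becomes [B, End)
//   - Inserting NewMI in front of A  -> the region becomes [NewMI, End)
//   - Removing the sole instruction of a size-1 region -> the region is
//     emptied and represented as [BB->end(), BB->end()).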