//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C) :
    GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
    HasExcessPressure(false), MF(nullptr) { }

void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  // FIXME: This is also necessary because some passes that run after
  // scheduling and before regalloc increase register pressure.
  const unsigned ErrorMargin = 3;

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  TargetOccupancy = MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
  VGPRCriticalLimit =
      std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);

  // Subtract error margin from register limits and avoid overflow.
  SGPRCriticalLimit =
      std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit =
      std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
}

void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop, const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure) {

  Cand.SU = SU;
  Cand.AtTop = AtTop;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom-up scheduling, the register pressure is cached
    // and can be retrieved by DAG->getPressureDiff(SU).
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasExcessPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasExcessPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.
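  // Illustrative example (hypothetical numbers, not tied to any particular
  // subtarget): with SGPRCriticalLimit = 98 and VGPRCriticalLimit = 125, a
  // candidate that raises SGPR pressure to 100 (delta +2) and VGPR pressure
  // to 124 (delta -1) is recorded against the SGPR set below, since only the
  // larger of the two deltas is reported.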

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasExcessPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
                  SGPRPressure, VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
    for (SDep &Dep : SU->Preds) {
      if (Dep.isCluster()) {
        HasClusteredNodes = true;
        break;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
                                           std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)),
      ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()),
      MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}

void GCNScheduleDAGMILive::schedule() {
  if (Stage == Collect) {
    // Just record regions at the first pass.
    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
    return;
  }

  std::vector<MachineInstr*> Unsched;
  Unsched.reserve(NumRegionInstrs);
  for (auto &I : *this) {
    Unsched.push_back(&I);
  }

  GCNRegPressure PressureBefore;
  if (LIS) {
    PressureBefore = Pressure[RegionIdx];

    LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
               GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
               dbgs() << "Region live-in pressure: ";
               llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
               dbgs() << "Region register pressure: ";
               PressureBefore.print(dbgs()));
  }

  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
  // Set HasClusteredNodes to true for late stages where we have already
  // collected it. That way pickNode() will not scan SDep's when not needed.
  S.HasClusteredNodes = Stage > InitialSchedule;
  S.HasExcessPressure = false;
  ScheduleDAGMILive::schedule();
  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
  RescheduleRegions[RegionIdx] = false;
  if (Stage == InitialSchedule && S.HasClusteredNodes)
    RegionsWithClusters[RegionIdx] = true;
  if (S.HasExcessPressure)
    RegionsWithHighRP[RegionIdx] = true;

  if (!LIS)
    return;

  // Check the results of scheduling.
  auto PressureAfter = getRealRegPressure();

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
             PressureAfter.print(dbgs()));

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    Pressure[RegionIdx] = PressureAfter;
    RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == MinOccupancy;

    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned WavesAfter =
      std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the just
  // scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
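  // For example (hypothetical values): with WavesBefore = 7, WavesAfter = 5,
  // MinOccupancy = 6 and a minimum allowed occupancy of 4, the check below
  // accepts the drop and NewOccupancy becomes 5.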
  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < MinOccupancy) {
    MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(MinOccupancy);
    RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << MinOccupancy << ".\n");
  }

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
      PressureAfter.getAGPRNum() > MaxVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    RescheduleRegions[RegionIdx] = true;
    RegionsWithHighRP[RegionIdx] = true;
  }

  // If this condition is true, then either the occupancy before and after
  // scheduling is the same, or we are allowing the occupancy to drop because
  // the function is memory bound. Even if we are OK with the current occupancy,
  // we still need to verify that we will not introduce any extra chance of
  // spilling.
  if (WavesAfter >= MinOccupancy) {
    if (Stage == UnclusteredReschedule &&
        !PressureAfter.less(ST, PressureBefore)) {
      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
               PressureAfter.less(ST, PressureBefore) ||
               !RescheduleRegions[RegionIdx]) {
      Pressure[RegionIdx] = PressureAfter;
      RegionsWithMinOcc[RegionIdx] =
          PressureAfter.getOccupancy(ST) == MinOccupancy;
      if (!RegionsWithClusters[RegionIdx] &&
          (Stage + 1) == UnclusteredReschedule)
        RescheduleRegions[RegionIdx] = false;
      return;
    } else {
      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    }
  }

  RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
                                 (Stage + 1) != UnclusteredReschedule;
  RegionEnd = RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != RegionEnd) {
      BB->remove(MI);
      BB->insert(RegionEnd, MI);
      if (!MI->isDebugInstr())
        LIS->handleMove(*MI, true);
    }
    // Reset read-undef flags and update them later.
    for (auto &Op : MI->operands())
      if (Op.isReg() && Op.isDef())
        Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *LIS);
      }
    }
    RegionEnd = MI->getIterator();
    ++RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past debug instrs to the actual end of the scheduling region.
  while (SkippedDebugInstr-- > 0)
    ++RegionEnd;

  // If Unsched.front() is a debug instruction, this will actually shrink the
  // region since we moved all debug instructions to the end of the block.
  // Find the first instruction that is not a debug instruction.
  RegionBegin = Unsched.front()->getIterator();
  if (RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  placeDebugValues();

  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
}

GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent to scheduling after the current block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
    SlotIndexes *Ind = LIS->getSlotIndexes();
    if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
      OnlySucc = *MBB->succ_begin();
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for ( ; ; ) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithClusters.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithClusters.reset();
  RegionsWithHighRP.reset();
  RegionsWithMinOcc.reset();

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

  do {
    Stage++;
    RegionIdx = 0;
    MachineBasicBlock *MBB = nullptr;

    if (Stage > InitialSchedule) {
      if (!LIS)
        break;

      // Retry function scheduling if the resulting occupancy is lower than
      // the occupancy used for the first-pass scheduling. This will give more
      // freedom to schedule low register pressure blocks.
      // Code is partially copied from MachineSchedulerBase::scheduleRegions().

      if (Stage == UnclusteredReschedule) {
        if (RescheduleRegions.none())
          continue;
        LLVM_DEBUG(dbgs() <<
                   "Retrying function scheduling without clustering.\n");
      }

      if (Stage == ClusteredLowOccupancyReschedule) {
        if (StartingOccupancy <= MinOccupancy)
          break;

        LLVM_DEBUG(
            dbgs()
            << "Retrying function scheduling with lowest recorded occupancy "
            << MinOccupancy << ".\n");
      }

      if (Stage == PreRARematerialize) {
        if (RegionsWithMinOcc.count() != 1 || Regions.size() == 1)
          break;

        const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
        const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
        // Check maximum occupancy.
        if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
            MinOccupancy)
          break;

        // FIXME: This pass will invalidate the cached LiveIns, MBBLiveIns and
        // Pressure for regions in between the defs and the region we sank the
        // def to. Will need to be fixed if there is another pass after this
        // pass.
        static_assert(LastStage == PreRARematerialize,
                      "Passes after PreRARematerialize are not supported");

        unsigned HighRPIdx = RegionsWithMinOcc.find_first();
        collectRematerializableInstructions(HighRPIdx);
        if (RematerializableInsts.empty() ||
            !sinkTriviallyRematInsts(ST, TII, HighRPIdx))
          break;

        LLVM_DEBUG(
            dbgs() << "Retrying function scheduling with improved occupancy of "
                   << MinOccupancy << " from rematerializing\n");
      }
    }

    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);

    for (auto Region : Regions) {
      if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
           !RescheduleRegions[RegionIdx]) ||
          (Stage == ClusteredLowOccupancyReschedule &&
           !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {

        ++RegionIdx;
        continue;
      }

      RegionBegin = Region.first;
      RegionEnd = Region.second;

      if (RegionBegin->getParent() != MBB) {
        if (MBB) finishBlock();
        MBB = RegionBegin->getParent();
        startBlock(MBB);
        if (Stage == InitialSchedule)
          computeBlockPressure(MBB);
      }

      unsigned NumRegionInstrs = std::distance(begin(), end());
      enterRegion(MBB, begin(), end(), NumRegionInstrs);

      // Skip empty scheduling regions (0 or 1 schedulable instructions).
      if (begin() == end() || begin() == std::prev(end())) {
        exitRegion();
        ++RegionIdx;
        continue;
      }

      LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
      LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " "
                        << MBB->getName() << "\n From: " << *begin()
                        << " To: ";
                 if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
                 else dbgs() << "End";
                 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

      schedule();

      exitRegion();
      ++RegionIdx;
    }
    finishBlock();

    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);
  } while (Stage != LastStage);
}

void GCNScheduleDAGMILive::collectRematerializableInstructions(
    unsigned HighRPIdx) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  const GCNRPTracker::LiveRegSet &HighRPLiveIns = LiveIns[HighRPIdx];
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization
    if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
        !MRI.hasOneNonDBGUse(Reg))
      continue;

    // We are only collecting defs that are live-through or defined in another
    // block and used inside this region. This means that the register must be
    // in the live-in set for this region, else skip this def.
    if (HighRPLiveIns.find(Reg) == HighRPLiveIns.end())
      continue;

    MachineInstr *Def = MRI.getOneDef(Reg)->getParent();
    if (!Def || Def->getOperand(0).getSubReg() != 0 ||
        !isTriviallyReMaterializable(*Def, AA))
      continue;

    MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    RematerializableInsts.push_back(std::make_pair(Def, UseI));
  }
}

bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                                   const TargetInstrInfo *TII,
                                                   unsigned HighRPIdx) {
  RescheduleRegions.reset();
  GCNRPTracker::LiveRegSet NewLiveIns;
  // We may not need to rematerialize all instructions. Keep a list of
  // instructions we are rematerializing at the end.
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4>
      TrivialRematDefsToSink;

  GCNRegPressure RegionPressure = Pressure[HighRPIdx];
  int VGPRUsage = RegionPressure.getVGPRNum(ST.hasGFX90AInsts());
  int SGPRUsage = RegionPressure.getSGPRNum();

  // TODO: Handle occupancy drop due to AGPR and SGPR.
  // Check if the occupancy drop is caused by VGPR usage.
  if (ST.getOccupancyWithNumVGPRs(VGPRUsage) > MinOccupancy ||
      ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
    return false;

  NewLiveIns.copyFrom(LiveIns[HighRPIdx]);
  // First check if we have enough trivially rematerializable instructions to
  // improve occupancy. Optimistically assume all instructions we are able to
  // sink decrease RP.
  int TotalSinkableRegs = 0;
  for (auto &It : RematerializableInsts) {
    Register DefReg = It.first->getOperand(0).getReg();
    TotalSinkableRegs += SIRegisterInfo::getNumCoveredRegs(NewLiveIns[DefReg]);
  }
  int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
  unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
  // If we cannot improve occupancy even in the most optimistic scenario, do
  // not attempt to sink any instructions.
  if (OptimisticOccupancy <= MinOccupancy)
    return false;

  // Keep a list of newly rematerialized instructions so that we can easily
  // undo the sinking if occupancy is not improved.
  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  GCNDownwardRPTracker RPT(*LIS);
  auto *NonDbgMI = &*skipDebugInstructionsForward(Regions[HighRPIdx].first,
                                                  Regions[HighRPIdx].second);
  unsigned ImproveOccupancy = 0;
  for (auto &It : RematerializableInsts) {
    MachineInstr *Def = It.first;
    MachineBasicBlock::iterator InsertPos =
        MachineBasicBlock::iterator(It.second);
    Register Reg = Def->getOperand(0).getReg();
    // Rematerialize MI to its use block. Since we are only rematerializing
    // instructions that do not have any virtual reg uses, we do not need to
    // call LiveRangeEdit::allUsesAvailableAt() and
    // LiveRangeEdit::canRematerializeAt().
    NewLiveIns[Reg] = LaneBitmask::getNone();
    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                       Def->getOperand(0).getSubReg(), *Def, *TRI);
    MachineInstr *NewMI = &*(--InsertPos);
    LIS->InsertMachineInstrInMaps(*NewMI);
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
    InsertedMIToOldDef[NewMI] = Def;

    // FIXME: Need a better way to update RP without re-iterating over the
    // region.
    RPT.reset(*NonDbgMI, &NewLiveIns);
    RPT.advance(Regions[HighRPIdx].second);
    GCNRegPressure RPAfterSinking = RPT.moveMaxPressure();
    ImproveOccupancy = RPAfterSinking.getOccupancy(ST);
    if (ImproveOccupancy > MinOccupancy)
      break;
  }

  if (ImproveOccupancy <= MinOccupancy) {
    // Occupancy is not improved. Undo the sinking for the region.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy is improved.
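  // Commit the sinking: erase each original def, update LIS and BBLiveInMap,
  // and fix up region boundaries in both the source and destination regions.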
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;
    // Update region boundaries in the scheduling region we sank from, since
    // we may have sunk an instruction that was at the beginning or end of its
    // region.
    updateRegionBoundaries(OldMI, /*NewMI =*/nullptr, /*Removing =*/true);

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);

    // Update region boundaries in the region we sank to.
    MachineBasicBlock::iterator InsertPos =
        std::next(MachineBasicBlock::iterator(MI));
    updateRegionBoundaries(InsertPos, MI);
  }

  // Update cached live-ins and register pressure after rematerializing.
  LiveIns[HighRPIdx].copyFrom(NewLiveIns);
  MBBLiveIns.erase(Regions[HighRPIdx].first->getParent());

  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(Regions[HighRPIdx].first, Regions[HighRPIdx].second,
                    &LiveIns[HighRPIdx]);
  Pressure[HighRPIdx] = RPTracker.moveMaxPressure();

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++MinOccupancy);
  RescheduleRegions[HighRPIdx] = true;

  return true;
}

// Copied from MachineLICM
bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
                                                       AAResults *AA) {
  if (!TII->isTriviallyReMaterializable(MI, AA))
    return false;

  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing an instruction, we have to check both the beginning and the
// end of the region. When inserting, we only have to check whether NewMI is
// being inserted in front of a scheduling region; the end does not need to be
// checked because we only ever insert before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = Regions.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != Regions[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != Regions[I].first->getParent())
      return;

    if (Removing && MI == Regions[I].first && MI == Regions[I].second) {
      // MI is in a region of size 1. After removing it, the region will have
      // size 0, so set RegionBegin and RegionEnd to the end-of-block iterator.
      Regions[I] =
          std::make_pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == Regions[I].first) {
      if (Removing)
        Regions[I] = std::make_pair(std::next(MI), Regions[I].second);
      else
        // Inserted NewMI in front of the region; set the new RegionBegin to
        // NewMI.
        Regions[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
                                    Regions[I].second);
      return;
    }
    if (Removing && MI == Regions[I].second) {
      Regions[I] = std::make_pair(Regions[I].first, std::prev(MI));
      return;
    }
  }
}