//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This implements the ScheduleDAGInstrs class, which implements
/// re-scheduling of MachineInstrs.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/ADT/IntEqClasses.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
    cl::ZeroOrMore, cl::init(false),
    cl::desc("Enable use of AA during MI DAG construction"));

static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
    cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));

// Note: the two options below might be used in tuning compile time vs
// output quality. Setting HugeRegion so large that it will never be
// reached means best-effort, but may be slow.

// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
// together hold this many SUs, a reduction of maps will be done.
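// As an illustration (using only the defaults defined below): with HugeRegion
// at 1000 and -dag-maps-reduction-size left unset, once the two maps together
// reach 1000 SUs, getReductionSize() returns 500 and, roughly speaking, the
// 500 SUs furthest down in the region are given a dependency on a designated
// BarrierChain node and dropped from the maps, so instructions seen later
// only need an edge to that single node (see reduceHugeMemNodeMaps).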
static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
    cl::init(1000), cl::desc("The limit to use while constructing the DAG "
                             "prior to scheduling, at which point a trade-off "
                             "is made to avoid excessive compile time."));

static cl::opt<unsigned> ReductionSize(
    "dag-maps-reduction-size", cl::Hidden,
    cl::desc("A huge scheduling region will have maps reduced by this many "
             "nodes at a time. Defaults to HugeRegion / 2."));

static unsigned getReductionSize() {
  // Always reduce a huge region with half of the elements, except
  // when user sets this number explicitly.
  if (ReductionSize.getNumOccurrences() == 0)
    return HugeRegion / 2;
  return ReductionSize;
}

static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  dbgs() << "{ ";
  for (const SUnit *su : L) {
    dbgs() << "SU(" << su->NodeNum << ")";
    if (su != L.back())
      dbgs() << ", ";
  }
  dbgs() << "}\n";
#endif
}

ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                     const MachineLoopInfo *mli,
                                     bool RemoveKillFlags)
    : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
      RemoveKillFlags(RemoveKillFlags),
      UnknownValue(UndefValue::get(
                             Type::getVoidTy(mf.getFunction().getContext()))) {
  DbgValues.clear();

  const TargetSubtargetInfo &ST = mf.getSubtarget();
  SchedModel.init(&ST);
}

/// If this machine instr has memory reference information and it can be
/// tracked to a normal reference to a known object, return the Value
/// for that object. This function returns false if the memory location is
/// unknown or may alias anything.
static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
                                         const MachineFrameInfo &MFI,
                                         UnderlyingObjectsVector &Objects,
                                         const DataLayout &DL) {
  auto allMMOsOkay = [&]() {
    for (const MachineMemOperand *MMO : MI->memoperands()) {
      if (MMO->isVolatile())
        return false;

      if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
        // Functions that contain tail calls don't have unique PseudoSourceValue
        // objects. Two PseudoSourceValues might refer to the same or
        // overlapping locations. The client code calling this function assumes
        // this is not the case. So return a conservative answer of no known
        // object.
        if (MFI.hasTailCall())
          return false;

        // For now, ignore PseudoSourceValues which may alias LLVM IR values
        // because the code that uses this function has no way to cope with
        // such aliases.
        if (PSV->isAliased(&MFI))
          return false;

        bool MayAlias = PSV->mayAlias(&MFI);
        Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
      } else if (const Value *V = MMO->getValue()) {
        SmallVector<Value *, 4> Objs;
        if (!getUnderlyingObjectsForCodeGen(V, Objs, DL))
          return false;

        for (Value *V : Objs) {
          assert(isIdentifiedObject(V));
          Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
        }
      } else
        return false;
    }
    return true;
  };

  if (!allMMOsOkay()) {
    Objects.clear();
    return false;
  }

  return true;
}

void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
  BB = bb;
}

void ScheduleDAGInstrs::finishBlock() {
  // Subclasses should no longer refer to the old block.
  BB = nullptr;
}

void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                    MachineBasicBlock::iterator begin,
                                    MachineBasicBlock::iterator end,
                                    unsigned regioninstrs) {
  assert(bb == BB && "startBlock should set BB");
  RegionBegin = begin;
  RegionEnd = end;
  NumRegionInstrs = regioninstrs;
}

void ScheduleDAGInstrs::exitRegion() {
  // Nothing to do.
}

void ScheduleDAGInstrs::addSchedBarrierDeps() {
  MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
  ExitSU.setInstr(ExitMI);
  // Add dependencies on the defs and uses of the instruction.
  if (ExitMI) {
    for (const MachineOperand &MO : ExitMI->operands()) {
      if (!MO.isReg() || MO.isDef()) continue;
      unsigned Reg = MO.getReg();
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
      } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
        addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
      }
    }
  }
  if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
    // For others, e.g. fallthrough, conditional branch, assume the exit
    // uses all the registers that are livein to the successor blocks.
    for (const MachineBasicBlock *Succ : BB->successors()) {
      for (const auto &LI : Succ->liveins()) {
        if (!Uses.contains(LI.PhysReg))
          Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
      }
    }
  }
}

/// MO is an operand of SU's instruction that defines a physical register. Adds
/// data dependencies from SU to any uses of the physical register.
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
  const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
  assert(MO.isDef() && "expect physreg def");

  // Ask the target if address-backscheduling is desirable, and if so how much.
  const TargetSubtargetInfo &ST = MF.getSubtarget();

  // Only use any non-zero latency for real defs/uses, in contrast to
  // "fake" operands added by regalloc.
  const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc();
  bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() &&
                            !DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg()));
  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
       Alias.isValid(); ++Alias) {
    if (!Uses.contains(*Alias))
      continue;
    for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
      SUnit *UseSU = I->SU;
      if (UseSU == SU)
        continue;

      // Adjust the dependence latency using operand def/use information,
      // then allow the target to perform its own adjustments.
      int UseOp = I->OpIdx;
      MachineInstr *RegUse = nullptr;
      SDep Dep;
      if (UseOp < 0)
        Dep = SDep(SU, SDep::Artificial);
      else {
        // Set the hasPhysRegDefs only for physreg defs that have a use within
        // the scheduling region.
        SU->hasPhysRegDefs = true;
        Dep = SDep(SU, SDep::Data, *Alias);
        RegUse = UseSU->getInstr();
      }
      const MCInstrDesc *UseMIDesc =
          (RegUse ? &UseSU->getInstr()->getDesc() : nullptr);
      bool ImplicitPseudoUse =
          (UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) &&
           !UseMIDesc->hasImplicitUseOfPhysReg(*Alias));
      if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
        Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
                                                        RegUse, UseOp));
        ST.adjustSchedDependency(SU, UseSU, Dep);
      } else
        Dep.setLatency(0);

      UseSU->addPred(Dep);
    }
  }
}

/// Adds register dependencies (data, anti, and output) from this SUnit
/// to following instructions in the same scheduling region that depend on the
/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
  MachineInstr *MI = SU->getInstr();
  MachineOperand &MO = MI->getOperand(OperIdx);
  unsigned Reg = MO.getReg();
  // We do not need to track any dependencies for constant registers.
  if (MRI.isConstantPhysReg(Reg))
    return;

  // Optionally add output and anti dependencies. For anti
  // dependencies we use a latency of 0 because for a multi-issue
  // target we want to allow the defining instruction to issue
  // in the same cycle as the using instruction.
  // TODO: Using a latency of 1 here for output dependencies assumes
  //       there's no cost for reusing registers.
  SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
    if (!Defs.contains(*Alias))
      continue;
    for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
      SUnit *DefSU = I->SU;
      if (DefSU == &ExitSU)
        continue;
      if (DefSU != SU &&
          (Kind != SDep::Output || !MO.isDead() ||
           !DefSU->getInstr()->registerDefIsDead(*Alias))) {
        if (Kind == SDep::Anti)
          DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
        else {
          SDep Dep(SU, Kind, /*Reg=*/*Alias);
          Dep.setLatency(
              SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
          DefSU->addPred(Dep);
        }
      }
    }
  }

  if (!MO.isDef()) {
    SU->hasPhysRegUses = true;
    // Either insert a new Reg2SUnits entry with an empty SUnits list, or
    // retrieve the existing SUnits list for this register's uses.
    // Push this SUnit on the use list.
    Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
    if (RemoveKillFlags)
      MO.setIsKill(false);
  } else {
    addPhysRegDataDeps(SU, OperIdx);

    // Clear previous uses and defs of this register and its subregisters.
    for (MCSubRegIterator SubReg(Reg, TRI, true); SubReg.isValid(); ++SubReg) {
      if (Uses.contains(*SubReg))
        Uses.eraseAll(*SubReg);
      if (!MO.isDead())
        Defs.eraseAll(*SubReg);
    }
    if (MO.isDead() && SU->isCall) {
      // Calls will not be reordered because of chain dependencies (see
      // below). Since call operands are dead, calls may continue to be added
      // to the DefList making dependence checking quadratic in the size of
      // the block. Instead, we leave only one call at the back of the
      // DefList.
      Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
      Reg2SUnitsMap::iterator B = P.first;
      Reg2SUnitsMap::iterator I = P.second;
      for (bool isBegin = I == B; !isBegin; /* empty */) {
        isBegin = (--I) == B;
        if (!I->SU->isCall)
          break;
        I = Defs.erase(I);
      }
    }

    // Defs are pushed in the order they are visited and never reordered.
    Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
  }
}

LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
{
  unsigned Reg = MO.getReg();
  // No point in tracking lanemasks if we don't have interesting subregisters.
  const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
  if (!RC.HasDisjunctSubRegs)
    return LaneBitmask::getAll();

  unsigned SubReg = MO.getSubReg();
  if (SubReg == 0)
    return RC.getLaneMask();
  return TRI->getSubRegIndexLaneMask(SubReg);
}

/// Adds register output and data dependencies from this SUnit to instructions
/// that occur later in the same scheduling region if they read from or write to
/// the virtual register defined at OperIdx.
///
/// TODO: Hoist loop induction variable increments. This has to be
/// reevaluated. Generally, IV scheduling should be done before coalescing.
void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
  MachineInstr *MI = SU->getInstr();
  MachineOperand &MO = MI->getOperand(OperIdx);
  unsigned Reg = MO.getReg();

  LaneBitmask DefLaneMask;
  LaneBitmask KillLaneMask;
  if (TrackLaneMasks) {
    bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
    DefLaneMask = getLaneMaskForMO(MO);
    // If we have a <read-undef> flag, none of the lane values comes from an
    // earlier instruction.
    KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;

    // Clear undef flag, we'll re-add it later once we know which subregister
    // Def is first.
    MO.setIsUndef(false);
  } else {
    DefLaneMask = LaneBitmask::getAll();
    KillLaneMask = LaneBitmask::getAll();
  }

  if (MO.isDead()) {
    assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
           "Dead defs should have no uses");
  } else {
    // Add data dependence to all uses we found so far.
    const TargetSubtargetInfo &ST = MF.getSubtarget();
    for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
         E = CurrentVRegUses.end(); I != E; /*empty*/) {
      LaneBitmask LaneMask = I->LaneMask;
      // Ignore uses of other lanes.
      if ((LaneMask & KillLaneMask).none()) {
        ++I;
        continue;
      }

      if ((LaneMask & DefLaneMask).any()) {
        SUnit *UseSU = I->SU;
        MachineInstr *Use = UseSU->getInstr();
        SDep Dep(SU, SDep::Data, Reg);
        Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
                                                        I->OperandIndex));
        ST.adjustSchedDependency(SU, UseSU, Dep);
        UseSU->addPred(Dep);
      }

      LaneMask &= ~KillLaneMask;
      // If we found a Def for all lanes of this use, remove it from the list.
      if (LaneMask.any()) {
        I->LaneMask = LaneMask;
        ++I;
      } else
        I = CurrentVRegUses.erase(I);
    }
  }

  // Shortcut: Singly defined vregs do not have output/anti dependencies.
  if (MRI.hasOneDef(Reg))
    return;

  // Add output dependence to the next nearest defs of this vreg.
  //
  // Unless this definition is dead, the output dependence should be
  // transitively redundant with antidependencies from this definition's
  // uses. We're conservative for now until we have a way to guarantee the uses
  // are not eliminated sometime during scheduling. The output dependence edge
  // is also useful if output latency exceeds def-use latency.
  LaneBitmask LaneMask = DefLaneMask;
  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
                                     CurrentVRegDefs.end())) {
    // Ignore defs for other lanes.
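    // For example (lane masks invented for illustration): if this def writes
    // lane 0x1 of the vreg and a previous def recorded in V2SU covers lanes
    // 0x3, the masks overlap and an output dependence is added below; a
    // previous def covering only lane 0x2 would be skipped here.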
    if ((V2SU.LaneMask & LaneMask).none())
      continue;
    // Add an output dependence.
    SUnit *DefSU = V2SU.SU;
    // Ignore additional defs of the same lanes in one instruction. This can
    // happen because lanemasks are shared for targets with too many
    // subregisters. We also use some representation tricks/hacks where we
    // add super-register defs/uses, to imply that although we only access parts
    // of the reg we care about the full one.
    if (DefSU == SU)
      continue;
    SDep Dep(SU, SDep::Output, Reg);
    Dep.setLatency(
        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
    DefSU->addPred(Dep);

    // Update current definition. This can get tricky if the def was about a
    // bigger lanemask before. We then have to shrink it and create a new
    // VReg2SUnit for the non-overlapping part.
    LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
    LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
    V2SU.SU = SU;
    V2SU.LaneMask = OverlapMask;
    if (NonOverlapMask.any())
      CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
  }
  // If there was no CurrentVRegDefs entry for some lanes yet, create one.
  if (LaneMask.any())
    CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}

/// Adds a register data dependency if the instruction that defines the
/// virtual register used at OperIdx is mapped to an SUnit. Add a register
/// antidependency from this SUnit to instructions that occur later in the same
/// scheduling region if they write the virtual register.
///
/// TODO: Handle ExitSU "uses" properly.
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
  const MachineInstr *MI = SU->getInstr();
  const MachineOperand &MO = MI->getOperand(OperIdx);
  unsigned Reg = MO.getReg();

  // Remember the use. Data dependencies will be added when we find the def.
  LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
                                        : LaneBitmask::getAll();
  CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));

  // Add antidependences to the following defs of the vreg.
  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
                                     CurrentVRegDefs.end())) {
    // Ignore defs for unrelated lanes.
    LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
    if ((PrevDefLaneMask & LaneMask).none())
      continue;
    if (V2SU.SU == SU)
      continue;

    V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
  }
}

/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
  return MI->isCall() || MI->hasUnmodeledSideEffects() ||
         (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
}

void ScheduleDAGInstrs::addChainDependency(SUnit *SUa, SUnit *SUb,
                                           unsigned Latency) {
  if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
    SDep Dep(SUa, SDep::MayAliasMem);
    Dep.setLatency(Latency);
    SUb->addPred(Dep);
  }
}

/// Creates an SUnit for each real instruction, numbered in top-down
/// topological order. The instruction order A < B implies that no edge exists
/// from B to A.
///
/// Map each real instruction to its SUnit.
///
/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
/// instead of pointers.
///
/// MachineScheduler relies on initSUnits numbering the nodes by their order in
/// the original instruction list.
void ScheduleDAGInstrs::initSUnits() {
  // We'll be allocating one SUnit for each real instruction in the region,
  // which is contained within a basic block.
  SUnits.reserve(NumRegionInstrs);

  for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
    if (MI.isDebugInstr())
      continue;

    SUnit *SU = newSUnit(&MI);
    MISUnitMap[&MI] = SU;

    SU->isCall = MI.isCall();
    SU->isCommutable = MI.isCommutable();

    // Assign the Latency field of SU using target-provided information.
    SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());

    // If this SUnit uses a reserved or unbuffered resource, mark it as such.
    //
    // Reserved resources block an instruction from issuing and stall the
    // entire pipeline. These are identified by BufferSize=0.
    //
    // Unbuffered resources prevent execution of subsequent instructions that
    // require the same resources. This is used for in-order execution pipelines
    // within an out-of-order core. These are identified by BufferSize=1.
    if (SchedModel.hasInstrSchedModel()) {
      const MCSchedClassDesc *SC = getSchedClass(SU);
      for (const MCWriteProcResEntry &PRE :
           make_range(SchedModel.getWriteProcResBegin(SC),
                      SchedModel.getWriteProcResEnd(SC))) {
        switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
        case 0:
          SU->hasReservedResource = true;
          break;
        case 1:
          SU->isUnbuffered = true;
          break;
        default:
          break;
        }
      }
    }
  }
}

class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
  /// Current total number of SUs in map.
  unsigned NumNodes = 0;

  /// 1 for loads, 0 for stores. (see comment in SUList)
  unsigned TrueMemOrderLatency;

public:
  Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}

  /// To keep NumNodes up to date, insert() is used instead of
  /// this operator w/ push_back().
  ValueType &operator[](const SUList &Key) {
    llvm_unreachable("Don't use. Use insert() instead."); };

  /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
  /// reduce().
  void inline insert(SUnit *SU, ValueType V) {
    MapVector::operator[](V).push_back(SU);
    NumNodes++;
  }

  /// Clears the list of SUs mapped to V.
  void inline clearList(ValueType V) {
    iterator Itr = find(V);
    if (Itr != end()) {
      assert(NumNodes >= Itr->second.size());
      NumNodes -= Itr->second.size();

      Itr->second.clear();
    }
  }

  /// Clears map from all contents.
  void clear() {
    MapVector<ValueType, SUList>::clear();
    NumNodes = 0;
  }

  unsigned inline size() const { return NumNodes; }

  /// Counts the number of SUs in this map after a reduction.
  void reComputeSize() {
    NumNodes = 0;
    for (auto &I : *this)
      NumNodes += I.second.size();
  }

  unsigned inline getTrueMemOrderLatency() const {
    return TrueMemOrderLatency;
  }

  void dump();
};

void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
                                             Value2SUsMap &Val2SUsMap) {
  for (auto &I : Val2SUsMap)
    addChainDependencies(SU, I.second,
                         Val2SUsMap.getTrueMemOrderLatency());
}

void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
                                             Value2SUsMap &Val2SUsMap,
                                             ValueType V) {
  Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
  if (Itr != Val2SUsMap.end())
    addChainDependencies(SU, Itr->second,
                         Val2SUsMap.getTrueMemOrderLatency());
}

void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);

  for (auto &I : map) {
    SUList &sus = I.second;
    for (auto *SU : sus)
      SU->addPredBarrier(BarrierChain);
  }
  map.clear();
}

void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);

  // Go through all lists of SUs.
  for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
    Value2SUsMap::iterator CurrItr = I++;
    SUList &sus = CurrItr->second;
    SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
    for (; SUItr != SUEE; ++SUItr) {
      // Stop on BarrierChain or any instruction above it.
      if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
        break;

      (*SUItr)->addPredBarrier(BarrierChain);
    }

    // Remove also the BarrierChain from list if present.
    if (SUItr != SUEE && *SUItr == BarrierChain)
      SUItr++;

    // Remove all SUs that are now successors of BarrierChain.
    if (SUItr != sus.begin())
      sus.erase(sus.begin(), SUItr);
  }

  // Remove all entries with empty su lists.
  map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
      return (mapEntry.second.empty()); });

  // Recompute the size of the map (NumNodes).
  map.reComputeSize();
}

void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
                                        RegPressureTracker *RPTracker,
                                        PressureDiffs *PDiffs,
                                        LiveIntervals *LIS,
                                        bool TrackLaneMasks) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                       : ST.useAA();
  AAForDep = UseAA ? AA : nullptr;

  BarrierChain = nullptr;

  this->TrackLaneMasks = TrackLaneMasks;
  MISUnitMap.clear();
  ScheduleDAG::clearDAG();

  // Create an SUnit for each real instruction.
  initSUnits();

  if (PDiffs)
    PDiffs->init(SUnits.size());

  // We build scheduling units by walking a block's instruction list
  // from bottom to top.

  // Each MI's memory operand(s) is analyzed to a list of underlying
  // objects. The SU is then inserted in the SUList(s) mapped from the
  // Value(s). Each Value thus gets mapped to lists of SUs depending
  // on it, stores and loads kept separately. Two SUs are trivially
  // non-aliasing if they both depend on only identified Values and do
  // not share any common Value.
  Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);

  // Certain memory accesses are known to not alias any SU in Stores
  // or Loads, and have therefore their own 'NonAlias'
  // domain. E.g. spill / reload instructions never alias LLVM IR
  // Values. It would be nice to assume that this type of memory
  // accesses always have a proper memory operand modelling, and are
  // therefore never unanalyzable, but this is conservatively not
  // done.
  Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);

  // Remove any stale debug info; sometimes BuildSchedGraph is called again
  // without emitting the info from the previous call.
  DbgValues.clear();
  FirstDbgValue = nullptr;

  assert(Defs.empty() && Uses.empty() &&
         "Only BuildGraph should update Defs/Uses");
  Defs.setUniverse(TRI->getNumRegs());
  Uses.setUniverse(TRI->getNumRegs());

  assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
  assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
  unsigned NumVirtRegs = MRI.getNumVirtRegs();
  CurrentVRegDefs.setUniverse(NumVirtRegs);
  CurrentVRegUses.setUniverse(NumVirtRegs);

  // Model data dependencies between instructions being scheduled and the
  // ExitSU.
  addSchedBarrierDeps();

  // Walk the list of instructions, from bottom moving up.
  MachineInstr *DbgMI = nullptr;
  for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
       MII != MIE; --MII) {
    MachineInstr &MI = *std::prev(MII);
    if (DbgMI) {
      DbgValues.push_back(std::make_pair(DbgMI, &MI));
      DbgMI = nullptr;
    }

    if (MI.isDebugValue()) {
      DbgMI = &MI;
      continue;
    }
    if (MI.isDebugLabel())
      continue;

    SUnit *SU = MISUnitMap[&MI];
    assert(SU && "No SUnit mapped to this MI");

    if (RPTracker) {
      RegisterOperands RegOpers;
      RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
      if (TrackLaneMasks) {
        SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
      }
      if (PDiffs != nullptr)
        PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);

      if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI)
        RPTracker->recedeSkipDebugValues();
      assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
      RPTracker->recede(RegOpers);
    }

    assert(
        (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
        "Cannot schedule terminators or labels!");

    // Add register-based dependencies (data, anti, and output).
    // For some instructions (calls, returns, inline-asm, etc.) there can
    // be explicit uses and implicit defs, in which case the use will appear
    // on the operand list before the def. Do two passes over the operand
    // list to make sure that defs are processed before any uses.
    bool HasVRegDef = false;
    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI.getOperand(j);
      if (!MO.isReg() || !MO.isDef())
        continue;
      unsigned Reg = MO.getReg();
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        addPhysRegDeps(SU, j);
      } else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        HasVRegDef = true;
        addVRegDefDeps(SU, j);
      }
    }
    // Now process all uses.
    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI.getOperand(j);
      // Only look at use operands.
      // We do not need to check for MO.readsReg() here because subsequent
      // subregister defs will get output dependence edges and need no
      // additional use dependencies.
      if (!MO.isReg() || !MO.isUse())
        continue;
      unsigned Reg = MO.getReg();
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        addPhysRegDeps(SU, j);
      } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
        addVRegUseDeps(SU, j);
      }
    }

    // If we haven't seen any uses in this scheduling region, create a
    // dependence edge to ExitSU to model the live-out latency. This is required
    // for vreg defs with no in-region use, and prefetches with no vreg def.
    //
    // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
    // check currently relies on being called before adding chain deps.
    if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
      SDep Dep(SU, SDep::Artificial);
      Dep.setLatency(SU->Latency - 1);
      ExitSU.addPred(Dep);
    }

    // Add memory dependencies (Note: isStoreToStackSlot and
    // isLoadFromStackSlot are not usable after stack slots are lowered to
    // actual addresses).

    // This is a barrier event that acts as a pivotal node in the DAG.
    if (isGlobalMemoryObject(AA, &MI)) {

      // Become the barrier chain.
      if (BarrierChain)
        BarrierChain->addPredBarrier(SU);
      BarrierChain = SU;

      LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
                        << BarrierChain->NodeNum << ").\n";);

      // Add dependencies against everything below it and clear maps.
      addBarrierChain(Stores);
      addBarrierChain(Loads);
      addBarrierChain(NonAliasStores);
      addBarrierChain(NonAliasLoads);

      continue;
    }

    // If it's not a store or a variant load, we're done.
    if (!MI.mayStore() &&
        !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad(AA)))
      continue;

    // Always add a dependency edge to BarrierChain if present.
    if (BarrierChain)
      BarrierChain->addPredBarrier(SU);

    // Find the underlying objects for MI. The Objs vector is either
    // empty, or filled with the Values of memory locations which this
    // SU depends on.
    UnderlyingObjectsVector Objs;
    bool ObjsFound = getUnderlyingObjectsForInstr(&MI, MFI, Objs,
                                                  MF.getDataLayout());

    if (MI.mayStore()) {
      if (!ObjsFound) {
        // An unknown store depends on all stores and loads.
        addChainDependencies(SU, Stores);
        addChainDependencies(SU, NonAliasStores);
        addChainDependencies(SU, Loads);
        addChainDependencies(SU, NonAliasLoads);

        // Map this store to 'UnknownValue'.
        Stores.insert(SU, UnknownValue);
      } else {
        // Add precise dependencies against all previously seen memory
        // accesses mapped to the same Value(s).
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Add dependencies to previous stores and loads mapped to V.
          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
          addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
        }
        // Update the store map after all chains have been added to avoid adding
        // a self-loop edge if multiple underlying objects are present.
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Map this store to V.
          (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
        }
        // The store may have dependencies to unanalyzable loads and
        // stores.
        addChainDependencies(SU, Loads, UnknownValue);
        addChainDependencies(SU, Stores, UnknownValue);
      }
    } else { // SU is a load.
      if (!ObjsFound) {
        // An unknown load depends on all stores.
        addChainDependencies(SU, Stores);
        addChainDependencies(SU, NonAliasStores);

        Loads.insert(SU, UnknownValue);
      } else {
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Add precise dependencies against all previously seen stores
          // mapping to the same Value(s).
          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);

          // Map this load to V.
          (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
        }
        // The load may have dependencies to unanalyzable stores.
        addChainDependencies(SU, Stores, UnknownValue);
      }
    }

    // Reduce maps if they grow huge.
    if (Stores.size() + Loads.size() >= HugeRegion) {
      LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
      reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
    }
    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
      LLVM_DEBUG(
          dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
      reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
    }
  }

  if (DbgMI)
    FirstDbgValue = DbgMI;

  Defs.clear();
  Uses.clear();
  CurrentVRegDefs.clear();
  CurrentVRegUses.clear();
}

raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
  PSV->printCustom(OS);
  return OS;
}

void ScheduleDAGInstrs::Value2SUsMap::dump() {
  for (auto &Itr : *this) {
    if (Itr.first.is<const Value*>()) {
      const Value *V = Itr.first.get<const Value*>();
      if (isa<UndefValue>(V))
        dbgs() << "Unknown";
      else
        V->printAsOperand(dbgs());
    }
    else if (Itr.first.is<const PseudoSourceValue*>())
      dbgs() << Itr.first.get<const PseudoSourceValue*>();
    else
      llvm_unreachable("Unknown Value type.");

    dbgs() << " : ";
    dumpSUList(Itr.second);
  }
}

void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
                                              Value2SUsMap &loads, unsigned N) {
  LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
             dbgs() << "Loading SUnits:\n"; loads.dump());

  // Insert all SU's NodeNums into a vector and sort it.
  std::vector<unsigned> NodeNums;
  NodeNums.reserve(stores.size() + loads.size());
  for (auto &I : stores)
    for (auto *SU : I.second)
      NodeNums.push_back(SU->NodeNum);
  for (auto &I : loads)
    for (auto *SU : I.second)
      NodeNums.push_back(SU->NodeNum);
  llvm::sort(NodeNums);

  // The N last elements in NodeNums will be removed, and the SU with
  // the lowest NodeNum of them will become the new BarrierChain to
  // let the not yet seen SUs have a dependency to the removed SUs.
  assert(N <= NodeNums.size());
  SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
  if (BarrierChain) {
    // The aliasing and non-aliasing maps reduce independently of each
    // other, but share a common BarrierChain. Check if the
    // newBarrierChain is above the former one. If it is not, it may
    // introduce a loop to use newBarrierChain, so keep the old one.
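    // Illustration (NodeNums invented for the example): if the old
    // BarrierChain is SU(40) and the candidate computed above is SU(25),
    // SU(25) lies above SU(40) in the region, so SU(25) becomes the new
    // BarrierChain and SU(40) is made to depend on it. A candidate such as
    // SU(60), below the old BarrierChain, is rejected since adopting it
    // could create a cycle.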
    if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
      BarrierChain->addPredBarrier(newBarrierChain);
      BarrierChain = newBarrierChain;
      LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
                        << BarrierChain->NodeNum << ").\n";);
    }
    else
      LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
                        << BarrierChain->NodeNum << ").\n";);
  }
  else
    BarrierChain = newBarrierChain;

  insertBarrierChain(stores);
  insertBarrierChain(loads);

  LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
             dbgs() << "Loading SUnits:\n"; loads.dump());
}

static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
                        MachineInstr &MI, bool addToLiveRegs) {
  for (MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || !MO.readsReg())
      continue;
    unsigned Reg = MO.getReg();
    if (!Reg)
      continue;

    // Things that are available after the instruction are killed by it.
    bool IsKill = LiveRegs.available(MRI, Reg);
    MO.setIsKill(IsKill);
    if (addToLiveRegs)
      LiveRegs.addReg(Reg);
  }
}

void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
  LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');

  LiveRegs.init(*TRI);
  LiveRegs.addLiveOuts(MBB);

  // Examine block from end to start...
  for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
    if (MI.isDebugInstr())
      continue;

    // Update liveness. Registers that are defed but not used in this
    // instruction are now dead. Mark register and all subregs as they
    // are completely defined.
    for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
      const MachineOperand &MO = *O;
      if (MO.isReg()) {
        if (!MO.isDef())
          continue;
        unsigned Reg = MO.getReg();
        if (!Reg)
          continue;
        LiveRegs.removeReg(Reg);
      } else if (MO.isRegMask()) {
        LiveRegs.removeRegsInMask(MO);
      }
    }

    // If there is a bundle header fix it up first.
    if (!MI.isBundled()) {
      toggleKills(MRI, LiveRegs, MI, true);
    } else {
      MachineBasicBlock::instr_iterator First = MI.getIterator();
      if (MI.isBundle()) {
        toggleKills(MRI, LiveRegs, MI, false);
        ++First;
      }
      // Some targets make the (questionable) assumption that the instructions
      // inside the bundle are ordered and consequently only the last use of
      // a register inside the bundle can kill it.
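      // The loop below therefore walks the bundled instructions from the last
      // one upward, so the bottom-most read of each register is the first one
      // seen and is the one that receives the kill flag.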
      MachineBasicBlock::instr_iterator I = std::next(First);
      while (I->isBundledWithSucc())
        ++I;
      do {
        if (!I->isDebugInstr())
          toggleKills(MRI, LiveRegs, *I, true);
        --I;
      } while (I != First);
    }
  }
}

void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  dumpNodeName(SU);
  dbgs() << ": ";
  SU.getInstr()->dump();
#endif
}

void ScheduleDAGInstrs::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  if (EntrySU.getInstr() != nullptr)
    dumpNodeAll(EntrySU);
  for (const SUnit &SU : SUnits)
    dumpNodeAll(SU);
  if (ExitSU.getInstr() != nullptr)
    dumpNodeAll(ExitSU);
#endif
}

std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
  std::string s;
  raw_string_ostream oss(s);
  if (SU == &EntrySU)
    oss << "<entry>";
  else if (SU == &ExitSU)
    oss << "<exit>";
  else
    SU->getInstr()->print(oss, /*SkipOpers=*/true);
  return oss.str();
}

/// Return the basic block label. It is not necessarily unique because a block
/// contains multiple scheduling regions. But it is fine for visualization.
std::string ScheduleDAGInstrs::getDAGName() const {
  return "dag." + BB->getFullName();
}

//===----------------------------------------------------------------------===//
// SchedDFSResult Implementation
//===----------------------------------------------------------------------===//

namespace llvm {

/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
  SchedDFSResult &R;

  /// Join DAG nodes into equivalence classes by their subtree.
  IntEqClasses SubtreeClasses;
  /// List PredSU, SuccSU pairs that represent data edges between subtrees.
  std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;

  struct RootData {
    unsigned NodeID;
    unsigned ParentNodeID;      ///< Parent node (member of the parent subtree).
    unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
                                /// children.

    RootData(unsigned id): NodeID(id),
                           ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}

    unsigned getSparseSetIndex() const { return NodeID; }
  };

  SparseSet<RootData> RootSet;

public:
  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
    RootSet.setUniverse(R.DFSNodeData.size());
  }

  /// Returns true if this node has been visited by the DFS traversal.
  ///
  /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
  /// ID. Later, SubtreeID is updated but remains valid.
  bool isVisited(const SUnit *SU) const {
    return R.DFSNodeData[SU->NodeNum].SubtreeID
      != SchedDFSResult::InvalidSubtreeID;
  }

  /// Initializes this node's instruction count. We don't need to flag the node
  /// visited until visitPostorder because the DAG cannot have cycles.
  void visitPreorder(const SUnit *SU) {
    R.DFSNodeData[SU->NodeNum].InstrCount =
      SU->getInstr()->isTransient() ? 0 : 1;
  }

  /// Called once for each node after all predecessors are visited. Revisit this
  /// node's predecessors and potentially join them now that we know the ILP of
  /// the other predecessors.
  void visitPostorderNode(const SUnit *SU) {
    // Mark this node as the root of a subtree. It may be joined with its
    // successors later.
    R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
    RootData RData(SU->NodeNum);
    RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;

    // If any predecessors are still in their own subtree, they either cannot be
    // joined or are large enough to remain separate. If this parent node's
    // total instruction count is not greater than a child subtree by at least
    // the subtree limit, then try to join it now since splitting subtrees is
    // only useful if multiple high-pressure paths are possible.
    unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
    for (const SDep &PredDep : SU->Preds) {
      if (PredDep.getKind() != SDep::Data)
        continue;
      unsigned PredNum = PredDep.getSUnit()->NodeNum;
      if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
        joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);

      // Either link or merge the TreeData entry from the child to the parent.
      if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
        // If the predecessor's parent is invalid, this is a tree edge and the
        // current node is the parent.
        if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
          RootSet[PredNum].ParentNodeID = SU->NodeNum;
      }
      else if (RootSet.count(PredNum)) {
        // The predecessor is not a root, but is still in the root set. This
        // must be the new parent that it was just joined to. Note that
        // RootSet[PredNum].ParentNodeID may either be invalid or may still be
        // set to the original parent.
        RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
        RootSet.erase(PredNum);
      }
    }
    RootSet[SU->NodeNum] = RData;
  }

  /// Called once for each tree edge after calling visitPostorderNode on
  /// the predecessor. Increment the parent node's instruction count and
  /// preemptively join this subtree to its parent's if it is small enough.
  void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
    R.DFSNodeData[Succ->NodeNum].InstrCount
      += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
    joinPredSubtree(PredDep, Succ);
  }

  /// Adds a connection for cross edges.
  void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
    ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
  }

  /// Sets each node's subtree ID to the representative ID and records
  /// connections between trees.
  void finalize() {
    SubtreeClasses.compress();
    R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
    assert(SubtreeClasses.getNumClasses() == RootSet.size()
           && "number of roots should match trees");
    for (const RootData &Root : RootSet) {
      unsigned TreeID = SubtreeClasses[Root.NodeID];
      if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
        R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
      R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
      // Note that SubInstrCount may be greater than InstrCount if we joined
      // subtrees across a cross edge. InstrCount will be attributed to the
      // original parent, while SubInstrCount will be attributed to the joined
      // parent.
    }
    R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
    R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
    LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
    for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
      R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
      LLVM_DEBUG(dbgs() << "  SU(" << Idx << ") in tree "
                        << R.DFSNodeData[Idx].SubtreeID << '\n');
    }
    for (const std::pair<const SUnit*, const SUnit*> &P : ConnectionPairs) {
      unsigned PredTree = SubtreeClasses[P.first->NodeNum];
      unsigned SuccTree = SubtreeClasses[P.second->NodeNum];
      if (PredTree == SuccTree)
        continue;
      unsigned Depth = P.first->getDepth();
      addConnection(PredTree, SuccTree, Depth);
      addConnection(SuccTree, PredTree, Depth);
    }
  }

protected:
  /// Joins the predecessor subtree with the successor that is its DFS parent.
  /// Applies some heuristics before joining.
  bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
                       bool CheckLimit = true) {
    assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");

    // Check if the predecessor is already joined.
    const SUnit *PredSU = PredDep.getSUnit();
    unsigned PredNum = PredSU->NodeNum;
    if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
      return false;

    // Four is the magic number of successors before a node is considered a
    // pinch point.
    unsigned NumDataSucs = 0;
    for (const SDep &SuccDep : PredSU->Succs) {
      if (SuccDep.getKind() == SDep::Data) {
        if (++NumDataSucs >= 4)
          return false;
      }
    }
    if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
      return false;
    R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
    SubtreeClasses.join(Succ->NodeNum, PredNum);
    return true;
  }

  /// Called by finalize() to record a connection between trees.
  void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
    if (!Depth)
      return;

    do {
      SmallVectorImpl<SchedDFSResult::Connection> &Connections =
        R.SubtreeConnections[FromTree];
      for (SchedDFSResult::Connection &C : Connections) {
        if (C.TreeID == ToTree) {
          C.Level = std::max(C.Level, Depth);
          return;
        }
      }
      Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
      FromTree = R.DFSTreeData[FromTree].ParentTreeID;
    } while (FromTree != SchedDFSResult::InvalidSubtreeID);
  }
};

} // end namespace llvm

namespace {

/// Manage the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
  std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;

public:
  bool isComplete() const { return DFSStack.empty(); }

  void follow(const SUnit *SU) {
    DFSStack.push_back(std::make_pair(SU, SU->Preds.begin()));
  }
  void advance() { ++DFSStack.back().second; }

  const SDep *backtrack() {
    DFSStack.pop_back();
    return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
  }

  const SUnit *getCurr() const { return DFSStack.back().first; }

  SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }

  SUnit::const_pred_iterator getPredEnd() const {
    return getCurr()->Preds.end();
  }
};

} // end anonymous namespace

static bool hasDataSucc(const SUnit *SU) {
  for (const SDep &SuccDep : SU->Succs) {
    if (SuccDep.getKind() == SDep::Data &&
        !SuccDep.getSUnit()->isBoundaryNode())
      return true;
  }
  return false;
}

/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
/// search from this root.
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
  if (!IsBottomUp)
    llvm_unreachable("Top-down ILP metric is unimplemented");

  SchedDFSImpl Impl(*this);
  for (const SUnit &SU : SUnits) {
    if (Impl.isVisited(&SU) || hasDataSucc(&SU))
      continue;

    SchedDAGReverseDFS DFS;
    Impl.visitPreorder(&SU);
    DFS.follow(&SU);
    while (true) {
      // Traverse the leftmost path as far as possible.
      while (DFS.getPred() != DFS.getPredEnd()) {
        const SDep &PredDep = *DFS.getPred();
        DFS.advance();
        // Ignore non-data edges.
        if (PredDep.getKind() != SDep::Data
            || PredDep.getSUnit()->isBoundaryNode()) {
          continue;
        }
        // An already visited edge is a cross edge, assuming an acyclic DAG.
        if (Impl.isVisited(PredDep.getSUnit())) {
          Impl.visitCrossEdge(PredDep, DFS.getCurr());
          continue;
        }
        Impl.visitPreorder(PredDep.getSUnit());
        DFS.follow(PredDep.getSUnit());
      }
      // Visit the top of the stack in postorder and backtrack.
      const SUnit *Child = DFS.getCurr();
      const SDep *PredDep = DFS.backtrack();
      Impl.visitPostorderNode(Child);
      if (PredDep)
        Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
      if (DFS.isComplete())
        break;
    }
  }
  Impl.finalize();
}

/// The root of the given SubtreeID was just scheduled. For all subtrees
/// connected to this tree, record the depth of the connection so that the
/// nearest connected subtrees can be prioritized.
void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
  for (const Connection &C : SubtreeConnections[SubtreeID]) {
    SubtreeConnectLevels[C.TreeID] =
      std::max(SubtreeConnectLevels[C.TreeID], C.Level);
    LLVM_DEBUG(dbgs() << "  Tree: " << C.TreeID << " @"
                      << SubtreeConnectLevels[C.TreeID] << '\n');
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
  OS << InstrCount << " / " << Length << " = ";
  if (!Length)
    OS << "BADILP";
  else
    OS << format("%g", ((double)InstrCount / Length));
}

LLVM_DUMP_METHOD void ILPValue::dump() const {
  dbgs() << *this << '\n';
}

namespace llvm {

LLVM_DUMP_METHOD
raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
  Val.print(OS);
  return OS;
}

} // end namespace llvm

#endif
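// A rough usage sketch (illustrative only, not authoritative documentation):
// a scheduling pass such as MachineScheduler is expected to drive the
// interface implemented in this file per block and per region roughly as
// follows. "MySchedDAG" stands for a hypothetical ScheduleDAGInstrs subclass
// that provides the pure virtual schedule() method.
//
//   MySchedDAG DAG(MF, &MLI, /*RemoveKillFlags=*/false);
//   DAG.startBlock(&MBB);                     // set the current block
//   DAG.enterRegion(&MBB, Begin, End, NumRegionInstrs);
//   DAG.buildSchedGraph(AA, /*RPTracker=*/nullptr, /*PDiffs=*/nullptr,
//                       /*LIS=*/nullptr, /*TrackLaneMasks=*/false);
//   DAG.schedule();                           // subclass-defined ordering
//   DAG.exitRegion();
//   DAG.finishBlock();                        // forget the block (BB = nullptr)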