//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This implements the ScheduleDAGInstrs class, which implements
/// re-scheduling of MachineInstrs.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/ADT/IntEqClasses.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "misched"

static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
    cl::ZeroOrMore, cl::init(false),
    cl::desc("Enable use of AA during MI DAG construction"));

static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
    cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));

// Note: the two options below might be used in tuning compile time vs
// output quality. Setting HugeRegion so large that it will never be
// reached means best-effort, but may be slow.

// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
// together hold this many SUs, a reduction of maps will be done.
static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
    cl::init(1000), cl::desc("The limit to use while constructing the DAG "
                             "prior to scheduling, at which point a trade-off "
                             "is made to avoid excessive compile time."));

static cl::opt<unsigned> ReductionSize(
    "dag-maps-reduction-size", cl::Hidden,
    cl::desc("A huge scheduling region will have maps reduced by this many "
             "nodes at a time. Defaults to HugeRegion / 2."));
static unsigned getReductionSize() {
  // Always reduce a huge region with half of the elements, except
  // when user sets this number explicitly.
  if (ReductionSize.getNumOccurrences() == 0)
    return HugeRegion / 2;
  return ReductionSize;
}

static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  dbgs() << "{ ";
  for (const SUnit *su : L) {
    dbgs() << "SU(" << su->NodeNum << ")";
    if (su != L.back())
      dbgs() << ", ";
  }
  dbgs() << "}\n";
#endif
}

ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                     const MachineLoopInfo *mli,
                                     bool RemoveKillFlags)
    : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
      RemoveKillFlags(RemoveKillFlags),
      UnknownValue(UndefValue::get(
                     Type::getVoidTy(mf.getFunction()->getContext()))) {
  DbgValues.clear();

  const TargetSubtargetInfo &ST = mf.getSubtarget();
  SchedModel.init(ST.getSchedModel(), &ST, TII);
}

/// This is the function that does the work of looking through basic
/// ptrtoint+arithmetic+inttoptr sequences.
static const Value *getUnderlyingObjectFromInt(const Value *V) {
  do {
    if (const Operator *U = dyn_cast<Operator>(V)) {
      // If we find a ptrtoint, we can transfer control back to the
      // regular getUnderlyingObjectFromInt.
      if (U->getOpcode() == Instruction::PtrToInt)
        return U->getOperand(0);
      // If we find an add of a constant, a multiplied value, or a phi, it's
      // likely that the other operand will lead us to the base
      // object. We don't have to worry about the case where the
      // object address is somehow being computed by the multiply,
      // because our callers only care when the result is an
      // identifiable object.
      if (U->getOpcode() != Instruction::Add ||
          (!isa<ConstantInt>(U->getOperand(1)) &&
           Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
           !isa<PHINode>(U->getOperand(1))))
        return V;
      V = U->getOperand(0);
    } else {
      return V;
    }
    assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
  } while (true);
}

/// This is a wrapper around GetUnderlyingObjects and adds support for basic
/// ptrtoint+arithmetic+inttoptr sequences.
static void getUnderlyingObjects(const Value *V,
                                 SmallVectorImpl<Value *> &Objects,
                                 const DataLayout &DL) {
  SmallPtrSet<const Value *, 16> Visited;
  SmallVector<const Value *, 4> Working(1, V);
  do {
    V = Working.pop_back_val();

    SmallVector<Value *, 4> Objs;
    GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);

    for (Value *V : Objs) {
      if (!Visited.insert(V).second)
        continue;
      if (Operator::getOpcode(V) == Instruction::IntToPtr) {
        const Value *O =
          getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
        if (O->getType()->isPointerTy()) {
          Working.push_back(O);
          continue;
        }
      }
      Objects.push_back(const_cast<Value *>(V));
    }
  } while (!Working.empty());
}

/// If this machine instr has memory reference information and it can be
/// tracked to a normal reference to a known object, return the Value
/// for that object.
static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
                                         const MachineFrameInfo &MFI,
                                         UnderlyingObjectsVector &Objects,
                                         const DataLayout &DL) {
  auto allMMOsOkay = [&]() {
    for (const MachineMemOperand *MMO : MI->memoperands()) {
      if (MMO->isVolatile())
        return false;

      if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
        // Functions that contain tail calls don't have unique
        // PseudoSourceValue objects. Two PseudoSourceValues might refer to
        // the same or overlapping locations. The client code calling this
        // function assumes this is not the case. So return a conservative
        // answer of no known object.
        if (MFI.hasTailCall())
          return false;

        // For now, ignore PseudoSourceValues which may alias LLVM IR values
        // because the code that uses this function has no way to cope with
        // such aliases.
        if (PSV->isAliased(&MFI))
          return false;

        bool MayAlias = PSV->mayAlias(&MFI);
        Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
      } else if (const Value *V = MMO->getValue()) {
        SmallVector<Value *, 4> Objs;
        getUnderlyingObjects(V, Objs, DL);

        for (Value *V : Objs) {
          if (!isIdentifiedObject(V))
            return false;

          Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
        }
      } else
        return false;
    }
    return true;
  };

  if (!allMMOsOkay())
    Objects.clear();
}

void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
  BB = bb;
}

void ScheduleDAGInstrs::finishBlock() {
  // Subclasses should no longer refer to the old block.
  BB = nullptr;
}

void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                    MachineBasicBlock::iterator begin,
                                    MachineBasicBlock::iterator end,
                                    unsigned regioninstrs) {
  assert(bb == BB && "startBlock should set BB");
  RegionBegin = begin;
  RegionEnd = end;
  NumRegionInstrs = regioninstrs;
}

void ScheduleDAGInstrs::exitRegion() {
  // Nothing to do.
}

/// Models the region exit: sets ExitSU's instruction and records the
/// registers it (or the successor blocks) uses, so that in-region defs of
/// live-out values get a dependence edge to ExitSU.
void ScheduleDAGInstrs::addSchedBarrierDeps() {
  MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
  ExitSU.setInstr(ExitMI);
  // Add dependencies on the defs and uses of the instruction.
  if (ExitMI) {
    for (const MachineOperand &MO : ExitMI->operands()) {
      if (!MO.isReg() || MO.isDef()) continue;
      unsigned Reg = MO.getReg();
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
      } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
        addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
      }
    }
  }
  if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
    // For others, e.g. fallthrough, conditional branch, assume the exit
    // uses all the registers that are live-in to the successor blocks.
    for (const MachineBasicBlock *Succ : BB->successors()) {
      for (const auto &LI : Succ->liveins()) {
        if (!Uses.contains(LI.PhysReg))
          Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
      }
    }
  }
}

/// MO is an operand of SU's instruction that defines a physical register. Adds
/// data dependencies from SU to any uses of the physical register.
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
  const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
  assert(MO.isDef() && "expect physreg def");

  // Ask the target if address-backscheduling is desirable, and if so how much.
  const TargetSubtargetInfo &ST = MF.getSubtarget();

  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
       Alias.isValid(); ++Alias) {
    if (!Uses.contains(*Alias))
      continue;
    for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
      SUnit *UseSU = I->SU;
      if (UseSU == SU)
        continue;

      // Adjust the dependence latency using operand def/use information,
      // then allow the target to perform its own adjustments.
      int UseOp = I->OpIdx;
      MachineInstr *RegUse = nullptr;
      SDep Dep;
      if (UseOp < 0)
        Dep = SDep(SU, SDep::Artificial);
      else {
        // Set the hasPhysRegDefs only for physreg defs that have a use within
        // the scheduling region.
        SU->hasPhysRegDefs = true;
        Dep = SDep(SU, SDep::Data, *Alias);
        RegUse = UseSU->getInstr();
      }
      Dep.setLatency(
        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse,
                                         UseOp));

      ST.adjustSchedDependency(SU, UseSU, Dep);
      UseSU->addPred(Dep);
    }
  }
}

/// \brief Adds register dependencies (data, anti, and output) from this SUnit
/// to following instructions in the same scheduling region that depend on the
/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
  MachineInstr *MI = SU->getInstr();
  MachineOperand &MO = MI->getOperand(OperIdx);
  unsigned Reg = MO.getReg();
  // We do not need to track any dependencies for constant registers.
  if (MRI.isConstantPhysReg(Reg))
    return;

  // Optionally add output and anti dependencies. For anti
  // dependencies we use a latency of 0 because for a multi-issue
  // target we want to allow the defining instruction to issue
  // in the same cycle as the using instruction.
  // TODO: Using a latency of 1 here for output dependencies assumes
  //       there's no cost for reusing registers.
  SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
    if (!Defs.contains(*Alias))
      continue;
    for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
      SUnit *DefSU = I->SU;
      if (DefSU == &ExitSU)
        continue;
      if (DefSU != SU &&
          (Kind != SDep::Output || !MO.isDead() ||
           !DefSU->getInstr()->registerDefIsDead(*Alias))) {
        if (Kind == SDep::Anti)
          DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
        else {
          SDep Dep(SU, Kind, /*Reg=*/*Alias);
          Dep.setLatency(
            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
          DefSU->addPred(Dep);
        }
      }
    }
  }

  if (!MO.isDef()) {
    SU->hasPhysRegUses = true;
    // Either insert a new Reg2SUnits entry with an empty SUnits list, or
    // retrieve the existing SUnits list for this register's uses.
    // Push this SUnit on the use list.
    Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
    if (RemoveKillFlags)
      MO.setIsKill(false);
  } else {
    addPhysRegDataDeps(SU, OperIdx);

    // Clear this register's use list.
    if (Uses.contains(Reg))
      Uses.eraseAll(Reg);

    if (!MO.isDead()) {
      Defs.eraseAll(Reg);
    } else if (SU->isCall) {
      // Calls will not be reordered because of chain dependencies (see
      // below). Since call operands are dead, calls may continue to be added
      // to the DefList making dependence checking quadratic in the size of
      // the block. Instead, we leave only one call at the back of the
      // DefList.
      Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
      Reg2SUnitsMap::iterator B = P.first;
      Reg2SUnitsMap::iterator I = P.second;
      for (bool isBegin = I == B; !isBegin; /* empty */) {
        isBegin = (--I) == B;
        if (!I->SU->isCall)
          break;
        I = Defs.erase(I);
      }
    }

    // Defs are pushed in the order they are visited and never reordered.
    Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
  }
}

LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
{
  unsigned Reg = MO.getReg();
  // No point in tracking lanemasks if we don't have interesting subregisters.
  const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
  if (!RC.HasDisjunctSubRegs)
    return LaneBitmask::getAll();

  unsigned SubReg = MO.getSubReg();
  if (SubReg == 0)
    return RC.getLaneMask();
  return TRI->getSubRegIndexLaneMask(SubReg);
}

/// Adds register output and data dependencies from this SUnit to instructions
/// that occur later in the same scheduling region if they read from or write
/// to the virtual register defined at OperIdx.
///
/// TODO: Hoist loop induction variable increments. This has to be
/// reevaluated. Generally, IV scheduling should be done before coalescing.
void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
  MachineInstr *MI = SU->getInstr();
  MachineOperand &MO = MI->getOperand(OperIdx);
  unsigned Reg = MO.getReg();

  LaneBitmask DefLaneMask;
  LaneBitmask KillLaneMask;
  if (TrackLaneMasks) {
    bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
    DefLaneMask = getLaneMaskForMO(MO);
    // If we have a <read-undef> flag, none of the lane values comes from an
    // earlier instruction.
    KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;

    // Clear undef flag, we'll re-add it later once we know which subregister
    // Def is first.
    MO.setIsUndef(false);
  } else {
    DefLaneMask = LaneBitmask::getAll();
    KillLaneMask = LaneBitmask::getAll();
  }

  if (MO.isDead()) {
    assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
           "Dead defs should have no uses");
  } else {
    // Add data dependence to all uses we found so far.
    const TargetSubtargetInfo &ST = MF.getSubtarget();
    for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
         E = CurrentVRegUses.end(); I != E; /*empty*/) {
      LaneBitmask LaneMask = I->LaneMask;
      // Ignore uses of other lanes.
      if ((LaneMask & KillLaneMask).none()) {
        ++I;
        continue;
      }

      if ((LaneMask & DefLaneMask).any()) {
        SUnit *UseSU = I->SU;
        MachineInstr *Use = UseSU->getInstr();
        SDep Dep(SU, SDep::Data, Reg);
        Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
                                                        I->OperandIndex));
        ST.adjustSchedDependency(SU, UseSU, Dep);
        UseSU->addPred(Dep);
      }

      LaneMask &= ~KillLaneMask;
      // If we found a Def for all lanes of this use, remove it from the list.
      if (LaneMask.any()) {
        I->LaneMask = LaneMask;
        ++I;
      } else
        I = CurrentVRegUses.erase(I);
    }
  }

  // Shortcut: Singly defined vregs do not have output/anti dependencies.
  if (MRI.hasOneDef(Reg))
    return;

  // Add output dependence to the next nearest defs of this vreg.
  //
  // Unless this definition is dead, the output dependence should be
  // transitively redundant with antidependencies from this definition's
  // uses. We're conservative for now until we have a way to guarantee the uses
  // are not eliminated sometime during scheduling. The output dependence edge
  // is also useful if output latency exceeds def-use latency.
  LaneBitmask LaneMask = DefLaneMask;
  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
                                     CurrentVRegDefs.end())) {
    // Ignore defs for other lanes.
    if ((V2SU.LaneMask & LaneMask).none())
      continue;
    // Add an output dependence.
    SUnit *DefSU = V2SU.SU;
    // Ignore additional defs of the same lanes in one instruction. This can
    // happen because lanemasks are shared for targets with too many
    // subregisters. We also use some representation tricks/hacks where we
    // add super-register defs/uses, to imply that although we only access
    // parts of the reg we care about the full one.
    if (DefSU == SU)
      continue;
    SDep Dep(SU, SDep::Output, Reg);
    Dep.setLatency(
      SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
    DefSU->addPred(Dep);

    // Update current definition. This can get tricky if the def covered a
    // bigger lanemask before. We then have to shrink it and create a new
    // VReg2SUnit for the non-overlapping part.
    LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
    LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
    V2SU.SU = SU;
    V2SU.LaneMask = OverlapMask;
    if (NonOverlapMask.any())
      CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
  }
  // If there was no CurrentVRegDefs entry for some lanes yet, create one.
  if (LaneMask.any())
    CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}

/// \brief Adds a register data dependency if the instruction that defines the
/// virtual register used at OperIdx is mapped to an SUnit. Add a register
/// antidependency from this SUnit to instructions that occur later in the same
/// scheduling region if they write the virtual register.
///
/// TODO: Handle ExitSU "uses" properly.
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
  const MachineInstr *MI = SU->getInstr();
  const MachineOperand &MO = MI->getOperand(OperIdx);
  unsigned Reg = MO.getReg();

  // Remember the use. Data dependencies will be added when we find the def.
  LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
                                        : LaneBitmask::getAll();
  CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));

  // Add antidependences to the following defs of the vreg.
  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
                                     CurrentVRegDefs.end())) {
    // Ignore defs for unrelated lanes.
    LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
    if ((PrevDefLaneMask & LaneMask).none())
      continue;
    if (V2SU.SU == SU)
      continue;

    V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
  }
}

/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
  return MI->isCall() || MI->hasUnmodeledSideEffects() ||
         (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
}

/// Adds a memory chain edge making SUb depend on SUa, unless alias analysis
/// concludes that the two instructions cannot access overlapping memory.
void ScheduleDAGInstrs::addChainDependency(SUnit *SUa, SUnit *SUb,
                                           unsigned Latency) {
  if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
    SDep Dep(SUa, SDep::MayAliasMem);
    Dep.setLatency(Latency);
    SUb->addPred(Dep);
  }
}

/// \brief Creates an SUnit for each real instruction, numbered in top-down
/// topological order. The instruction order A < B implies that no edge exists
/// from B to A.
///
/// Map each real instruction to its SUnit.
///
/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
/// instead of pointers.
///
/// MachineScheduler relies on initSUnits numbering the nodes by their order in
/// the original instruction list.
void ScheduleDAGInstrs::initSUnits() {
  // We'll be allocating one SUnit for each real instruction in the region,
  // which is contained within a basic block.
  SUnits.reserve(NumRegionInstrs);

  for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
    if (MI.isDebugValue())
      continue;

    SUnit *SU = newSUnit(&MI);
    MISUnitMap[&MI] = SU;

    SU->isCall = MI.isCall();
    SU->isCommutable = MI.isCommutable();

    // Assign the Latency field of SU using target-provided information.
    SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());

    // If this SUnit uses a reserved or unbuffered resource, mark it as such.
    //
    // Reserved resources block an instruction from issuing and stall the
    // entire pipeline. These are identified by BufferSize=0.
    //
    // Unbuffered resources prevent execution of subsequent instructions that
    // require the same resources. This is used for in-order execution
    // pipelines within an out-of-order core. These are identified by
    // BufferSize=1.
    if (SchedModel.hasInstrSchedModel()) {
      const MCSchedClassDesc *SC = getSchedClass(SU);
      for (const MCWriteProcResEntry &PRE :
           make_range(SchedModel.getWriteProcResBegin(SC),
                      SchedModel.getWriteProcResEnd(SC))) {
        switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
        case 0:
          SU->hasReservedResource = true;
          break;
        case 1:
          SU->isUnbuffered = true;
          break;
        default:
          break;
        }
      }
    }
  }
}

class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
  /// Current total number of SUs in map.
  unsigned NumNodes = 0;

  /// 1 for loads, 0 for stores. (see comment in SUList)
  unsigned TrueMemOrderLatency;

public:
  Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}

  /// To keep NumNodes up to date, insert() is used instead of
  /// this operator w/ push_back().
  ValueType &operator[](const SUList &Key) {
    llvm_unreachable("Don't use. Use insert() instead.");
  }

  /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
  /// reduce().
  void inline insert(SUnit *SU, ValueType V) {
    MapVector::operator[](V).push_back(SU);
    NumNodes++;
  }

  /// Clears the list of SUs mapped to V.
  void inline clearList(ValueType V) {
    iterator Itr = find(V);
    if (Itr != end()) {
      assert(NumNodes >= Itr->second.size());
      NumNodes -= Itr->second.size();

      Itr->second.clear();
    }
  }

  /// Clears map from all contents.
  void clear() {
    MapVector<ValueType, SUList>::clear();
    NumNodes = 0;
  }

  unsigned inline size() const { return NumNodes; }

  /// Counts the number of SUs in this map after a reduction.
  void reComputeSize() {
    NumNodes = 0;
    for (auto &I : *this)
      NumNodes += I.second.size();
  }

  unsigned inline getTrueMemOrderLatency() const {
    return TrueMemOrderLatency;
  }

  void dump();
};

void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
                                             Value2SUsMap &Val2SUsMap) {
  for (auto &I : Val2SUsMap)
    addChainDependencies(SU, I.second,
                         Val2SUsMap.getTrueMemOrderLatency());
}

void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
                                             Value2SUsMap &Val2SUsMap,
                                             ValueType V) {
  Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
  if (Itr != Val2SUsMap.end())
    addChainDependencies(SU, Itr->second,
                         Val2SUsMap.getTrueMemOrderLatency());
}

/// Adds the current BarrierChain as a dependency of every SU in the map, then
/// clears the map.
void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);

  for (auto &I : map) {
    SUList &sus = I.second;
    for (auto *SU : sus)
      SU->addPredBarrier(BarrierChain);
  }
  map.clear();
}

/// Adds the current BarrierChain as a dependency of the SUs in the map that
/// lie below it (i.e. have a higher NodeNum), and prunes them from the lists.
void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);

  // Go through all lists of SUs.
  for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
    Value2SUsMap::iterator CurrItr = I++;
    SUList &sus = CurrItr->second;
    SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
    for (; SUItr != SUEE; ++SUItr) {
      // Stop on BarrierChain or any instruction above it.
      if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
        break;

      (*SUItr)->addPredBarrier(BarrierChain);
    }

    // Also remove the BarrierChain from the list if present.
    if (SUItr != SUEE && *SUItr == BarrierChain)
      SUItr++;

    // Remove all SUs that are now successors of BarrierChain.
    if (SUItr != sus.begin())
      sus.erase(sus.begin(), SUItr);
  }

  // Remove all entries with empty SU lists.
  map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
    return (mapEntry.second.empty()); });

  // Recompute the size of the map (NumNodes).
  map.reComputeSize();
}

void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
                                        RegPressureTracker *RPTracker,
                                        PressureDiffs *PDiffs,
                                        LiveIntervals *LIS,
                                        bool TrackLaneMasks) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                       : ST.useAA();
  AAForDep = UseAA ? AA : nullptr;

  BarrierChain = nullptr;

  this->TrackLaneMasks = TrackLaneMasks;
  MISUnitMap.clear();
  ScheduleDAG::clearDAG();

  // Create an SUnit for each real instruction.
  initSUnits();

  if (PDiffs)
    PDiffs->init(SUnits.size());

  // We build scheduling units by walking a block's instruction list
  // from bottom to top.

  // Each MI's memory operands are analyzed into a list of underlying
  // objects. The SU is then inserted in the SUList(s) mapped from the
  // Value(s). Each Value thus gets mapped to lists of SUs depending
  // on it, stores and loads kept separately. Two SUs are trivially
  // non-aliasing if they both depend on only identified Values and do
  // not share any common Value.
  Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);

  // Certain memory accesses are known to not alias any SU in Stores
  // or Loads, and therefore have their own 'NonAlias' domain. E.g.
  // spill / reload instructions never alias LLVM IR values. It would
  // be nice to assume that this kind of memory access always has
  // proper memory operand modelling, and is therefore never
  // unanalyzable, but this is conservatively not done.
  Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);

  // Remove any stale debug info; sometimes BuildSchedGraph is called again
  // without emitting the info from the previous call.
  DbgValues.clear();
  FirstDbgValue = nullptr;

  assert(Defs.empty() && Uses.empty() &&
         "Only BuildGraph should update Defs/Uses");
  Defs.setUniverse(TRI->getNumRegs());
  Uses.setUniverse(TRI->getNumRegs());

  assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
  assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
  unsigned NumVirtRegs = MRI.getNumVirtRegs();
  CurrentVRegDefs.setUniverse(NumVirtRegs);
  CurrentVRegUses.setUniverse(NumVirtRegs);

  // Model data dependencies between instructions being scheduled and the
  // ExitSU.
  addSchedBarrierDeps();

  // Walk the list of instructions, from bottom moving up.
  MachineInstr *DbgMI = nullptr;
  for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
       MII != MIE; --MII) {
    MachineInstr &MI = *std::prev(MII);
    if (DbgMI) {
      DbgValues.push_back(std::make_pair(DbgMI, &MI));
      DbgMI = nullptr;
    }

    if (MI.isDebugValue()) {
      DbgMI = &MI;
      continue;
    }
    SUnit *SU = MISUnitMap[&MI];
    assert(SU && "No SUnit mapped to this MI");

    if (RPTracker) {
      RegisterOperands RegOpers;
      RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
      if (TrackLaneMasks) {
        SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
      }
      if (PDiffs != nullptr)
        PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);

      RPTracker->recedeSkipDebugValues();
      assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
      RPTracker->recede(RegOpers);
    }

    assert(
        (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
        "Cannot schedule terminators or labels!");

    // Add register-based dependencies (data, anti, and output).
    // For some instructions (calls, returns, inline-asm, etc.) there can
    // be explicit uses and implicit defs, in which case the use will appear
    // on the operand list before the def. Do two passes over the operand
    // list to make sure that defs are processed before any uses.
    bool HasVRegDef = false;
    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI.getOperand(j);
      if (!MO.isReg() || !MO.isDef())
        continue;
      unsigned Reg = MO.getReg();
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        addPhysRegDeps(SU, j);
      } else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        HasVRegDef = true;
        addVRegDefDeps(SU, j);
      }
    }
    // Now process all uses.
    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI.getOperand(j);
      // Only look at use operands.
      // We do not need to check for MO.readsReg() here because subsequent
      // subregister defs will get output dependence edges and need no
      // additional use dependencies.
      if (!MO.isReg() || !MO.isUse())
        continue;
      unsigned Reg = MO.getReg();
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        addPhysRegDeps(SU, j);
      } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
        addVRegUseDeps(SU, j);
      }
    }

    // If we haven't seen any uses in this scheduling region, create a
    // dependence edge to ExitSU to model the live-out latency. This is
    // required for vreg defs with no in-region use, and prefetches with no
    // vreg def.
    //
    // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
    // check currently relies on being called before adding chain deps.
    if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
      SDep Dep(SU, SDep::Artificial);
      Dep.setLatency(SU->Latency - 1);
      ExitSU.addPred(Dep);
    }

    // Add memory dependencies (Note: isStoreToStackSlot and
    // isLoadFromStackSlot are not usable after stack slots are lowered to
    // actual addresses).

    // This is a barrier event that acts as a pivotal node in the DAG.
    if (isGlobalMemoryObject(AA, &MI)) {

      // Become the barrier chain.
      if (BarrierChain)
        BarrierChain->addPredBarrier(SU);
      BarrierChain = SU;

      DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
            << BarrierChain->NodeNum << ").\n";);

      // Add dependencies against everything below it and clear maps.
      addBarrierChain(Stores);
      addBarrierChain(Loads);
      addBarrierChain(NonAliasStores);
      addBarrierChain(NonAliasLoads);

      continue;
    }

    // If it's not a store or a variant load, we're done.
    if (!MI.mayStore() &&
        !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad(AA)))
      continue;

    // Always add a dependency edge to BarrierChain if present.
    if (BarrierChain)
      BarrierChain->addPredBarrier(SU);

    // Find the underlying objects for MI. The Objs vector is either
    // empty, or filled with the Values of memory locations which this
    // SU depends on. An empty vector means the memory location is
    // unknown, and may alias anything.
    UnderlyingObjectsVector Objs;
    getUnderlyingObjectsForInstr(&MI, MFI, Objs, MF.getDataLayout());

    if (MI.mayStore()) {
      if (Objs.empty()) {
        // An unknown store depends on all stores and loads.
        addChainDependencies(SU, Stores);
        addChainDependencies(SU, NonAliasStores);
        addChainDependencies(SU, Loads);
        addChainDependencies(SU, NonAliasLoads);

        // Map this store to 'UnknownValue'.
        Stores.insert(SU, UnknownValue);
      } else {
        // Add precise dependencies against all previously seen memory
        // accesses mapped to the same Value(s).
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Add dependencies to previous stores and loads mapped to V.
          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
          addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
        }
        // Update the store map after all chains have been added to avoid
        // adding a self-loop edge if multiple underlying objects are present.
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Map this store to V.
          (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
        }
        // The store may have dependencies to unanalyzable loads and
        // stores.
        addChainDependencies(SU, Loads, UnknownValue);
        addChainDependencies(SU, Stores, UnknownValue);
      }
    } else { // SU is a load.
      if (Objs.empty()) {
        // An unknown load depends on all stores.
        addChainDependencies(SU, Stores);
        addChainDependencies(SU, NonAliasStores);

        Loads.insert(SU, UnknownValue);
      } else {
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Add precise dependencies against all previously seen stores
          // mapping to the same Value(s).
          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);

          // Map this load to V.
          (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
        }
        // The load may have dependencies to unanalyzable stores.
        addChainDependencies(SU, Stores, UnknownValue);
      }
    }

    // Reduce maps if they grow huge.
    if (Stores.size() + Loads.size() >= HugeRegion) {
      DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
      reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
    }
    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
      DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
      reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
    }
  }

  if (DbgMI)
    FirstDbgValue = DbgMI;

  Defs.clear();
  Uses.clear();
  CurrentVRegDefs.clear();
  CurrentVRegUses.clear();
}

raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
  PSV->printCustom(OS);
  return OS;
}

void ScheduleDAGInstrs::Value2SUsMap::dump() {
  for (auto &Itr : *this) {
    if (Itr.first.is<const Value*>()) {
      const Value *V = Itr.first.get<const Value*>();
      if (isa<UndefValue>(V))
        dbgs() << "Unknown";
      else
        V->printAsOperand(dbgs());
    }
    else if (Itr.first.is<const PseudoSourceValue*>())
      dbgs() << Itr.first.get<const PseudoSourceValue*>();
    else
      llvm_unreachable("Unknown Value type.");

    dbgs() << " : ";
    dumpSUList(Itr.second);
  }
}

void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
                                              Value2SUsMap &loads, unsigned N) {
  DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
        stores.dump();
        dbgs() << "Loading SUnits:\n";
        loads.dump());

  // Insert all SUs' NodeNums into a vector and sort it.
  std::vector<unsigned> NodeNums;
  NodeNums.reserve(stores.size() + loads.size());
  for (auto &I : stores)
    for (auto *SU : I.second)
      NodeNums.push_back(SU->NodeNum);
  for (auto &I : loads)
    for (auto *SU : I.second)
      NodeNums.push_back(SU->NodeNum);
  std::sort(NodeNums.begin(), NodeNums.end());

  // The last N elements in NodeNums will be removed, and the SU with
  // the lowest NodeNum of them will become the new BarrierChain to
  // let the not yet seen SUs have a dependency to the removed SUs.
  assert(N <= NodeNums.size());
  SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
  if (BarrierChain) {
    // The aliasing and non-aliasing maps reduce independently of each
    // other, but share a common BarrierChain. Check if the
    // newBarrierChain is above the former one. If it is not, it may
    // introduce a loop to use newBarrierChain, so keep the old one.
    if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
      BarrierChain->addPredBarrier(newBarrierChain);
      BarrierChain = newBarrierChain;
      DEBUG(dbgs() << "Inserting new barrier chain: SU("
            << BarrierChain->NodeNum << ").\n";);
    }
    else
      DEBUG(dbgs() << "Keeping old barrier chain: SU("
            << BarrierChain->NodeNum << ").\n";);
  }
  else
    BarrierChain = newBarrierChain;

  insertBarrierChain(stores);
  insertBarrierChain(loads);

  DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
        stores.dump();
        dbgs() << "Loading SUnits:\n";
        loads.dump());
}

/// Updates the kill flags on the register uses of MI: a use is a kill if the
/// register is not live after MI (according to LiveRegs). Optionally adds the
/// killed registers to LiveRegs, since they are live above the kill.
static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
                        MachineInstr &MI, bool addToLiveRegs) {
  for (MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || !MO.readsReg())
      continue;
    unsigned Reg = MO.getReg();
    if (!Reg)
      continue;

    // Things that are available after the instruction are killed by it.
    bool IsKill = LiveRegs.available(MRI, Reg);
    MO.setIsKill(IsKill);
    if (IsKill && addToLiveRegs)
      LiveRegs.addReg(Reg);
  }
}

void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
  DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n');

  LiveRegs.init(*TRI);
  LiveRegs.addLiveOuts(MBB);

  // Examine block from end to start...
  for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
    if (MI.isDebugValue())
      continue;

    // Update liveness. Registers that are defed but not used in this
    // instruction are now dead. Remove them (and all their subregs) from the
    // live set, since they are completely defined here.
    for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
      const MachineOperand &MO = *O;
      if (MO.isReg()) {
        if (!MO.isDef())
          continue;
        unsigned Reg = MO.getReg();
        if (!Reg)
          continue;
        LiveRegs.removeReg(Reg);
      } else if (MO.isRegMask()) {
        LiveRegs.removeRegsInMask(MO);
      }
    }

    // If there is a bundle header fix it up first.
    if (!MI.isBundled()) {
      toggleKills(MRI, LiveRegs, MI, true);
    } else {
      MachineBasicBlock::instr_iterator First = MI.getIterator();
      if (MI.isBundle()) {
        toggleKills(MRI, LiveRegs, MI, false);
        ++First;
      }
      // Some targets make the (questionable) assumption that the instructions
      // inside the bundle are ordered and consequently only the last use of
      // a register inside the bundle can kill it.
      MachineBasicBlock::instr_iterator I = std::next(First);
      while (I->isBundledWithSucc())
        ++I;
      do {
        if (!I->isDebugValue())
          toggleKills(MRI, LiveRegs, *I, true);
        --I;
      } while (I != First);
    }
  }
}

void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
  // Cannot completely remove virtual function even in release mode.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  SU->getInstr()->dump();
#endif
}

std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
  std::string s;
  raw_string_ostream oss(s);
  if (SU == &EntrySU)
    oss << "<entry>";
  else if (SU == &ExitSU)
    oss << "<exit>";
  else
    SU->getInstr()->print(oss, /*SkipOpers=*/true);
  return oss.str();
}

/// Return the basic block label. It is not necessarily unique because a block
/// contains multiple scheduling regions. But it is fine for visualization.
std::string ScheduleDAGInstrs::getDAGName() const {
  return "dag." + BB->getFullName();
}

//===----------------------------------------------------------------------===//
// SchedDFSResult Implementation
//===----------------------------------------------------------------------===//

namespace llvm {

/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
  SchedDFSResult &R;

  /// Join DAG nodes into equivalence classes by their subtree.
  IntEqClasses SubtreeClasses;
  /// List PredSU, SuccSU pairs that represent data edges between subtrees.
  std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;

  struct RootData {
    unsigned NodeID;
    unsigned ParentNodeID;      ///< Parent node (member of the parent subtree).
    unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
                                /// children.

    RootData(unsigned id): NodeID(id),
                           ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}

    unsigned getSparseSetIndex() const { return NodeID; }
  };

  SparseSet<RootData> RootSet;

public:
  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
    RootSet.setUniverse(R.DFSNodeData.size());
  }

  /// Returns true if this node has been visited by the DFS traversal.
  ///
  /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
  /// ID. Later, SubtreeID is updated but remains valid.
  bool isVisited(const SUnit *SU) const {
    return R.DFSNodeData[SU->NodeNum].SubtreeID
      != SchedDFSResult::InvalidSubtreeID;
  }

  /// Initializes this node's instruction count. We don't need to flag the node
  /// visited until visitPostorder because the DAG cannot have cycles.
  void visitPreorder(const SUnit *SU) {
    R.DFSNodeData[SU->NodeNum].InstrCount =
      SU->getInstr()->isTransient() ? 0 : 1;
  }

  /// Called once for each node after all predecessors are visited. Revisit
  /// this node's predecessors and potentially join them now that we know the
  /// ILP of the other predecessors.
  void visitPostorderNode(const SUnit *SU) {
    // Mark this node as the root of a subtree. It may be joined with its
    // successors later.
    R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
    RootData RData(SU->NodeNum);
    RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;

    // If any predecessors are still in their own subtree, they either cannot
    // be joined or are large enough to remain separate. If this parent node's
    // total instruction count is not greater than a child subtree by at least
    // the subtree limit, then try to join it now since splitting subtrees is
    // only useful if multiple high-pressure paths are possible.
    unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
    for (const SDep &PredDep : SU->Preds) {
      if (PredDep.getKind() != SDep::Data)
        continue;
      unsigned PredNum = PredDep.getSUnit()->NodeNum;
      if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
        joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);

      // Either link or merge the TreeData entry from the child to the parent.
      if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
        // If the predecessor's parent is invalid, this is a tree edge and the
        // current node is the parent.
        if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
          RootSet[PredNum].ParentNodeID = SU->NodeNum;
      }
      else if (RootSet.count(PredNum)) {
        // The predecessor is not a root, but is still in the root set. This
        // must be the new parent that it was just joined to. Note that
        // RootSet[PredNum].ParentNodeID may either be invalid or may still be
        // set to the original parent.
        RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
        RootSet.erase(PredNum);
      }
    }
    RootSet[SU->NodeNum] = RData;
  }

  /// \brief Called once for each tree edge after calling visitPostorderNode on
  /// the predecessor. Increment the parent node's instruction count and
  /// preemptively join this subtree to its parent's if it is small enough.
  void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
    R.DFSNodeData[Succ->NodeNum].InstrCount
      += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
    joinPredSubtree(PredDep, Succ);
  }

  /// Adds a connection for cross edges.
  void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
    ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
  }

  /// Sets each node's subtree ID to the representative ID and records
  /// connections between trees.
  void finalize() {
    SubtreeClasses.compress();
    R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
    assert(SubtreeClasses.getNumClasses() == RootSet.size()
           && "number of roots should match trees");
    for (const RootData &Root : RootSet) {
      unsigned TreeID = SubtreeClasses[Root.NodeID];
      if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
        R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
      R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
      // Note that SubInstrCount may be greater than InstrCount if we joined
      // subtrees across a cross edge. InstrCount will be attributed to the
      // original parent, while SubInstrCount will be attributed to the joined
      // parent.
    }
    R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
    R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
    DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
    for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
      R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
      DEBUG(dbgs() << " SU(" << Idx << ") in tree "
            << R.DFSNodeData[Idx].SubtreeID << '\n');
    }
    for (const std::pair<const SUnit*, const SUnit*> &P : ConnectionPairs) {
      unsigned PredTree = SubtreeClasses[P.first->NodeNum];
      unsigned SuccTree = SubtreeClasses[P.second->NodeNum];
      if (PredTree == SuccTree)
        continue;
      unsigned Depth = P.first->getDepth();
      addConnection(PredTree, SuccTree, Depth);
      addConnection(SuccTree, PredTree, Depth);
    }
  }

protected:
  /// Joins the predecessor subtree with the successor that is its DFS parent.
  /// Applies some heuristics before joining.
  bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
                       bool CheckLimit = true) {
    assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");

    // Check if the predecessor is already joined.
    const SUnit *PredSU = PredDep.getSUnit();
    unsigned PredNum = PredSU->NodeNum;
    if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
      return false;

    // Four is the magic number of successors before a node is considered a
    // pinch point.
    unsigned NumDataSucs = 0;
    for (const SDep &SuccDep : PredSU->Succs) {
      if (SuccDep.getKind() == SDep::Data) {
        if (++NumDataSucs >= 4)
          return false;
      }
    }
    if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
      return false;
    R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
    SubtreeClasses.join(Succ->NodeNum, PredNum);
    return true;
  }

  /// Called by finalize() to record a connection between trees.
  void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
    if (!Depth)
      return;

    do {
      SmallVectorImpl<SchedDFSResult::Connection> &Connections =
        R.SubtreeConnections[FromTree];
      for (SchedDFSResult::Connection &C : Connections) {
        if (C.TreeID == ToTree) {
          C.Level = std::max(C.Level, Depth);
          return;
        }
      }
      Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
      FromTree = R.DFSTreeData[FromTree].ParentTreeID;
    } while (FromTree != SchedDFSResult::InvalidSubtreeID);
  }
};

} // end namespace llvm

namespace {

/// Manage the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
  std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;

public:
  bool isComplete() const { return DFSStack.empty(); }

  void follow(const SUnit *SU) {
    DFSStack.push_back(std::make_pair(SU, SU->Preds.begin()));
  }
  void advance() { ++DFSStack.back().second; }

  const SDep *backtrack() {
    DFSStack.pop_back();
    return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
  }

  const SUnit *getCurr() const { return DFSStack.back().first; }

  SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }

  SUnit::const_pred_iterator getPredEnd() const {
    return getCurr()->Preds.end();
  }
};

} // end anonymous namespace

static bool hasDataSucc(const SUnit *SU) {
  for (const SDep &SuccDep : SU->Succs) {
    if (SuccDep.getKind() == SDep::Data &&
        !SuccDep.getSUnit()->isBoundaryNode())
      return true;
  }
  return false;
}

/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
/// search from this root.
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
  if (!IsBottomUp)
    llvm_unreachable("Top-down ILP metric is unimplemented");

  SchedDFSImpl Impl(*this);
  for (const SUnit &SU : SUnits) {
    if (Impl.isVisited(&SU) || hasDataSucc(&SU))
      continue;

    SchedDAGReverseDFS DFS;
    Impl.visitPreorder(&SU);
    DFS.follow(&SU);
    while (true) {
      // Traverse the leftmost path as far as possible.
      while (DFS.getPred() != DFS.getPredEnd()) {
        const SDep &PredDep = *DFS.getPred();
        DFS.advance();
        // Ignore non-data edges.
        if (PredDep.getKind() != SDep::Data
            || PredDep.getSUnit()->isBoundaryNode()) {
          continue;
        }
        // An already visited edge is a cross edge, assuming an acyclic DAG.
        if (Impl.isVisited(PredDep.getSUnit())) {
          Impl.visitCrossEdge(PredDep, DFS.getCurr());
          continue;
        }
        Impl.visitPreorder(PredDep.getSUnit());
        DFS.follow(PredDep.getSUnit());
      }
      // Visit the top of the stack in postorder and backtrack.
      const SUnit *Child = DFS.getCurr();
      const SDep *PredDep = DFS.backtrack();
      Impl.visitPostorderNode(Child);
      if (PredDep)
        Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
      if (DFS.isComplete())
        break;
    }
  }
  Impl.finalize();
}

/// The root of the given SubtreeID was just scheduled. For all subtrees
/// connected to this tree, record the depth of the connection so that the
/// nearest connected subtrees can be prioritized.
void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
  for (const Connection &C : SubtreeConnections[SubtreeID]) {
    SubtreeConnectLevels[C.TreeID] =
      std::max(SubtreeConnectLevels[C.TreeID], C.Level);
    DEBUG(dbgs() << " Tree: " << C.TreeID
          << " @" << SubtreeConnectLevels[C.TreeID] << '\n');
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
  OS << InstrCount << " / " << Length << " = ";
  if (!Length)
    OS << "BADILP";
  else
    OS << format("%g", ((double)InstrCount / Length));
}

LLVM_DUMP_METHOD void ILPValue::dump() const {
  dbgs() << *this << '\n';
}

namespace llvm {

LLVM_DUMP_METHOD
raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
  Val.print(OS);
  return OS;
}

} // end namespace llvm

#endif