1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10 //
11 // This SMS implementation is a target-independent back-end pass. When enabled,
12 // the pass runs just prior to the register allocation pass, while the machine
13 // IR is in SSA form. If software pipelining is successful, then the original
14 // loop is replaced by the optimized loop. The optimized loop contains one or
15 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16 // the instructions cannot be scheduled in a given MII, we increase the MII by
17 // one and try again.
18 //
19 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20 // represent loop carried dependences in the DAG as order edges to the Phi
21 // nodes. We also perform several passes over the DAG to eliminate unnecessary
22 // edges that inhibit the ability to pipeline. The implementation uses the
23 // DFAPacketizer class to compute the minimum initiation interval and the check
24 // where an instruction may be inserted in the pipelined schedule.
25 //
26 // In order for the SMS pass to work, several target specific hooks need to be
27 // implemented to get information about the loop structure and to rewrite
28 // instructions.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #include "llvm/ADT/ArrayRef.h"
33 #include "llvm/ADT/BitVector.h"
34 #include "llvm/ADT/DenseMap.h"
35 #include "llvm/ADT/MapVector.h"
36 #include "llvm/ADT/PriorityQueue.h"
37 #include "llvm/ADT/SetVector.h"
38 #include "llvm/ADT/SmallPtrSet.h"
39 #include "llvm/ADT/SmallSet.h"
40 #include "llvm/ADT/SmallVector.h"
41 #include "llvm/ADT/Statistic.h"
42 #include "llvm/ADT/iterator_range.h"
43 #include "llvm/Analysis/AliasAnalysis.h"
44 #include "llvm/Analysis/MemoryLocation.h"
45 #include "llvm/Analysis/ValueTracking.h"
46 #include "llvm/CodeGen/DFAPacketizer.h"
47 #include "llvm/CodeGen/LiveIntervals.h"
48 #include "llvm/CodeGen/MachineBasicBlock.h"
49 #include "llvm/CodeGen/MachineDominators.h"
50 #include "llvm/CodeGen/MachineFunction.h"
51 #include "llvm/CodeGen/MachineFunctionPass.h"
52 #include "llvm/CodeGen/MachineInstr.h"
53 #include "llvm/CodeGen/MachineInstrBuilder.h"
54 #include "llvm/CodeGen/MachineLoopInfo.h"
55 #include "llvm/CodeGen/MachineMemOperand.h"
56 #include "llvm/CodeGen/MachineOperand.h"
57 #include "llvm/CodeGen/MachinePipeliner.h"
58 #include "llvm/CodeGen/MachineRegisterInfo.h"
59 #include "llvm/CodeGen/RegisterPressure.h"
60 #include "llvm/CodeGen/ScheduleDAG.h"
61 #include "llvm/CodeGen/ScheduleDAGMutation.h"
62 #include "llvm/CodeGen/TargetOpcodes.h"
63 #include "llvm/CodeGen/TargetRegisterInfo.h"
64 #include "llvm/CodeGen/TargetSubtargetInfo.h"
65 #include "llvm/Config/llvm-config.h"
66 #include "llvm/IR/Attributes.h"
67 #include "llvm/IR/DebugLoc.h"
68 #include "llvm/IR/Function.h"
69 #include "llvm/MC/LaneBitmask.h"
70 #include "llvm/MC/MCInstrDesc.h"
71 #include "llvm/MC/MCInstrItineraries.h"
72 #include "llvm/MC/MCRegisterInfo.h"
73 #include "llvm/Pass.h"
74 #include "llvm/Support/CommandLine.h"
75 #include "llvm/Support/Compiler.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/MathExtras.h"
78 #include "llvm/Support/raw_ostream.h"
79 #include <algorithm>
80 #include <cassert>
81 #include <climits>
82 #include <cstdint>
83 #include <deque>
84 #include <functional>
85 #include <iterator>
86 #include <map>
87 #include <memory>
88 #include <tuple>
89 #include <utility>
90 #include <vector>
91 
92 using namespace llvm;
93 
94 #define DEBUG_TYPE "pipeliner"
95 
96 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
97 STATISTIC(NumPipelined, "Number of loops software pipelined");
98 STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
99 STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
100 STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
101 STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
102 STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
103 STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
104 STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
105 STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
106 STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
107 
108 /// A command line option to turn software pipelining on or off.
109 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
110                                cl::ZeroOrMore,
111                                cl::desc("Enable Software Pipelining"));
112 
113 /// A command line option to enable SWP at -Os.
114 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
115                                       cl::desc("Enable SWP at Os."), cl::Hidden,
116                                       cl::init(false));
117 
118 /// A command line argument to limit minimum initial interval for pipelining.
119 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
120                               cl::desc("Size limit for the MII."),
121                               cl::Hidden, cl::init(27));
122 
123 /// A command line argument to limit the number of stages in the pipeline.
124 static cl::opt<int>
125     SwpMaxStages("pipeliner-max-stages",
126                  cl::desc("Maximum stages allowed in the generated scheduled."),
127                  cl::Hidden, cl::init(3));
128 
129 /// A command line option to disable the pruning of chain dependences due to
130 /// an unrelated Phi.
131 static cl::opt<bool>
132     SwpPruneDeps("pipeliner-prune-deps",
133                  cl::desc("Prune dependences between unrelated Phi nodes."),
134                  cl::Hidden, cl::init(true));
135 
136 /// A command line option to disable the pruning of loop carried order
137 /// dependences.
138 static cl::opt<bool>
139     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
140                         cl::desc("Prune loop carried order dependences."),
141                         cl::Hidden, cl::init(true));
142 
143 #ifndef NDEBUG
144 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
145 #endif
146 
147 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
148                                      cl::ReallyHidden, cl::init(false),
149                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
150 
151 namespace llvm {
152 
153 // A command line option to enable the CopyToPhi DAG mutation.
154 cl::opt<bool>
155     SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
156                        cl::init(true), cl::ZeroOrMore,
157                        cl::desc("Enable CopyToPhi DAG Mutation"));
158 
159 } // end namespace llvm
160 
161 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
162 char MachinePipeliner::ID = 0;
163 #ifndef NDEBUG
164 int MachinePipeliner::NumTries = 0;
165 #endif
166 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
167 
168 INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
169                       "Modulo Software Pipelining", false, false)
170 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
171 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
172 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
173 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
174 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
175                     "Modulo Software Pipelining", false, false)
176 
177 /// The "main" function for implementing Swing Modulo Scheduling.
178 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
179   if (skipFunction(mf.getFunction()))
180     return false;
181 
182   if (!EnableSWP)
183     return false;
184 
185   if (mf.getFunction().getAttributes().hasAttribute(
186           AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
187       !EnableSWPOptSize.getPosition())
188     return false;
189 
190   // Cannot pipeline loops without instruction itineraries if we are using
191   // DFA for the pipeliner.
192   if (mf.getSubtarget().useDFAforSMS() &&
193       (!mf.getSubtarget().getInstrItineraryData() ||
194        mf.getSubtarget().getInstrItineraryData()->isEmpty()))
195     return false;
196 
197   MF = &mf;
198   MLI = &getAnalysis<MachineLoopInfo>();
199   MDT = &getAnalysis<MachineDominatorTree>();
200   TII = MF->getSubtarget().getInstrInfo();
201   RegClassInfo.runOnMachineFunction(*MF);
202 
203   for (auto &L : *MLI)
204     scheduleLoop(*L);
205 
206   return false;
207 }
208 
209 /// Attempt to perform the SMS algorithm on the specified loop. This function is
210 /// the main entry point for the algorithm.  The function identifies candidate
211 /// loops, calculates the minimum initiation interval, and attempts to schedule
212 /// the loop.
213 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
214   bool Changed = false;
215   for (auto &InnerLoop : L)
216     Changed |= scheduleLoop(*InnerLoop);
217 
218 #ifndef NDEBUG
219   // Stop trying after reaching the limit (if any).
220   int Limit = SwpLoopLimit;
221   if (Limit >= 0) {
222     if (NumTries >= SwpLoopLimit)
223       return Changed;
224     NumTries++;
225   }
226 #endif
227 
228   setPragmaPipelineOptions(L);
229   if (!canPipelineLoop(L)) {
230     LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
231     return Changed;
232   }
233 
234   ++NumTrytoPipeline;
235 
236   Changed = swingModuloScheduler(L);
237 
238   return Changed;
239 }
240 
241 void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
242   MachineBasicBlock *LBLK = L.getTopBlock();
243 
244   if (LBLK == nullptr)
245     return;
246 
247   const BasicBlock *BBLK = LBLK->getBasicBlock();
248   if (BBLK == nullptr)
249     return;
250 
251   const Instruction *TI = BBLK->getTerminator();
252   if (TI == nullptr)
253     return;
254 
255   MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
256   if (LoopID == nullptr)
257     return;
258 
259   assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
260   assert(LoopID->getOperand(0) == LoopID && "invalid loop");
261 
262   for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
263     MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
264 
265     if (MD == nullptr)
266       continue;
267 
268     MDString *S = dyn_cast<MDString>(MD->getOperand(0));
269 
270     if (S == nullptr)
271       continue;
272 
273     if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
274       assert(MD->getNumOperands() == 2 &&
275              "Pipeline initiation interval hint metadata should have two operands.");
276       II_setByPragma =
277           mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
278       assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
279     } else if (S->getString() == "llvm.loop.pipeline.disable") {
280       disabledByPragma = true;
281     }
282   }
283 }
284 
285 /// Return true if the loop can be software pipelined.  The algorithm is
286 /// restricted to loops with a single basic block.  Make sure that the
287 /// branch in the loop can be analyzed.
288 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
289   if (L.getNumBlocks() != 1)
290     return false;
291 
292   if (disabledByPragma)
293     return false;
294 
295   // Check if the branch can't be understood because we can't do pipelining
296   // if that's the case.
297   LI.TBB = nullptr;
298   LI.FBB = nullptr;
299   LI.BrCond.clear();
300   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
301     LLVM_DEBUG(
302         dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n");
303     NumFailBranch++;
304     return false;
305   }
306 
307   LI.LoopInductionVar = nullptr;
308   LI.LoopCompare = nullptr;
309   if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) {
310     LLVM_DEBUG(
311         dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");
312     NumFailLoop++;
313     return false;
314   }
315 
316   if (!L.getLoopPreheader()) {
317     LLVM_DEBUG(
318         dbgs() << "Preheader not found, can NOT pipeline current Loop\n");
319     NumFailPreheader++;
320     return false;
321   }
322 
323   // Remove any subregisters from inputs to phi nodes.
324   preprocessPhiNodes(*L.getHeader());
325   return true;
326 }
327 
328 void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
329   MachineRegisterInfo &MRI = MF->getRegInfo();
330   SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
331 
332   for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
333     MachineOperand &DefOp = PI.getOperand(0);
334     assert(DefOp.getSubReg() == 0);
335     auto *RC = MRI.getRegClass(DefOp.getReg());
336 
337     for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
338       MachineOperand &RegOp = PI.getOperand(i);
339       if (RegOp.getSubReg() == 0)
340         continue;
341 
342       // If the operand uses a subregister, replace it with a new register
343       // without subregisters, and generate a copy to the new register.
344       unsigned NewReg = MRI.createVirtualRegister(RC);
345       MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
346       MachineBasicBlock::iterator At = PredB.getFirstTerminator();
347       const DebugLoc &DL = PredB.findDebugLoc(At);
348       auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
349                     .addReg(RegOp.getReg(), getRegState(RegOp),
350                             RegOp.getSubReg());
351       Slots.insertMachineInstrInMaps(*Copy);
352       RegOp.setReg(NewReg);
353       RegOp.setSubReg(0);
354     }
355   }
356 }
357 
358 /// The SMS algorithm consists of the following main steps:
359 /// 1. Computation and analysis of the dependence graph.
360 /// 2. Ordering of the nodes (instructions).
361 /// 3. Attempt to Schedule the loop.
362 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
363   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
364 
365   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
366                         II_setByPragma);
367 
368   MachineBasicBlock *MBB = L.getHeader();
369   // The kernel should not include any terminator instructions.  These
370   // will be added back later.
371   SMS.startBlock(MBB);
372 
373   // Compute the number of 'real' instructions in the basic block by
374   // ignoring terminators.
375   unsigned size = MBB->size();
376   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
377                                    E = MBB->instr_end();
378        I != E; ++I, --size)
379     ;
380 
381   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
382   SMS.schedule();
383   SMS.exitRegion();
384 
385   SMS.finishBlock();
386   return SMS.hasNewSchedule();
387 }
388 
389 void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
390   if (II_setByPragma > 0)
391     MII = II_setByPragma;
392   else
393     MII = std::max(ResMII, RecMII);
394 }
395 
396 void SwingSchedulerDAG::setMAX_II() {
397   if (II_setByPragma > 0)
398     MAX_II = II_setByPragma;
399   else
400     MAX_II = MII + 10;
401 }
402 
403 /// We override the schedule function in ScheduleDAGInstrs to implement the
404 /// scheduling part of the Swing Modulo Scheduling algorithm.
405 void SwingSchedulerDAG::schedule() {
406   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
407   buildSchedGraph(AA);
408   addLoopCarriedDependences(AA);
409   updatePhiDependences();
410   Topo.InitDAGTopologicalSorting();
411   changeDependences();
412   postprocessDAG();
413   LLVM_DEBUG(dump());
414 
415   NodeSetType NodeSets;
416   findCircuits(NodeSets);
417   NodeSetType Circuits = NodeSets;
418 
419   // Calculate the MII.
420   unsigned ResMII = calculateResMII();
421   unsigned RecMII = calculateRecMII(NodeSets);
422 
423   fuseRecs(NodeSets);
424 
425   // This flag is used for testing and can cause correctness problems.
426   if (SwpIgnoreRecMII)
427     RecMII = 0;
428 
429   setMII(ResMII, RecMII);
430   setMAX_II();
431 
432   LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
433                     << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
434 
435   // Can't schedule a loop without a valid MII.
436   if (MII == 0) {
437     LLVM_DEBUG(
438         dbgs()
439         << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n");
440     NumFailZeroMII++;
441     return;
442   }
443 
444   // Don't pipeline large loops.
445   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
446     LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
447                       << ", we don't pipleline large loops\n");
448     NumFailLargeMaxMII++;
449     return;
450   }
451 
452   computeNodeFunctions(NodeSets);
453 
454   registerPressureFilter(NodeSets);
455 
456   colocateNodeSets(NodeSets);
457 
458   checkNodeSets(NodeSets);
459 
460   LLVM_DEBUG({
461     for (auto &I : NodeSets) {
462       dbgs() << "  Rec NodeSet ";
463       I.dump();
464     }
465   });
466 
467   llvm::stable_sort(NodeSets, std::greater<NodeSet>());
468 
469   groupRemainingNodes(NodeSets);
470 
471   removeDuplicateNodes(NodeSets);
472 
473   LLVM_DEBUG({
474     for (auto &I : NodeSets) {
475       dbgs() << "  NodeSet ";
476       I.dump();
477     }
478   });
479 
480   computeNodeOrder(NodeSets);
481 
482   // check for node order issues
483   checkValidNodeOrder(Circuits);
484 
485   SMSchedule Schedule(Pass.MF);
486   Scheduled = schedulePipeline(Schedule);
487 
488   if (!Scheduled){
489     LLVM_DEBUG(dbgs() << "No schedule found, return\n");
490     NumFailNoSchedule++;
491     return;
492   }
493 
494   unsigned numStages = Schedule.getMaxStageCount();
495   // No need to generate pipeline if there are no overlapped iterations.
496   if (numStages == 0) {
497     LLVM_DEBUG(
498         dbgs() << "No overlapped iterations, no need to generate pipeline\n");
499     NumFailZeroStage++;
500     return;
501   }
502   // Check that the maximum stage count is less than user-defined limit.
503   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
504     LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
505                       << " : too many stages, abort\n");
506     NumFailLargeMaxStage++;
507     return;
508   }
509 
510   generatePipelinedLoop(Schedule);
511   ++NumPipelined;
512 }
513 
514 /// Clean up after the software pipeliner runs.
515 void SwingSchedulerDAG::finishBlock() {
516   for (MachineInstr *I : NewMIs)
517     MF.DeleteMachineInstr(I);
518   NewMIs.clear();
519 
520   // Call the superclass.
521   ScheduleDAGInstrs::finishBlock();
522 }
523 
524 /// Return the register values for  the operands of a Phi instruction.
525 /// This function assume the instruction is a Phi.
526 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
527                        unsigned &InitVal, unsigned &LoopVal) {
528   assert(Phi.isPHI() && "Expecting a Phi.");
529 
530   InitVal = 0;
531   LoopVal = 0;
532   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
533     if (Phi.getOperand(i + 1).getMBB() != Loop)
534       InitVal = Phi.getOperand(i).getReg();
535     else
536       LoopVal = Phi.getOperand(i).getReg();
537 
538   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
539 }
540 
541 /// Return the Phi register value that comes from the incoming block.
542 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
543   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
544     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
545       return Phi.getOperand(i).getReg();
546   return 0;
547 }
548 
549 /// Return the Phi register value that comes the loop block.
550 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
551   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
552     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
553       return Phi.getOperand(i).getReg();
554   return 0;
555 }
556 
557 /// Return true if SUb can be reached from SUa following the chain edges.
558 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
559   SmallPtrSet<SUnit *, 8> Visited;
560   SmallVector<SUnit *, 8> Worklist;
561   Worklist.push_back(SUa);
562   while (!Worklist.empty()) {
563     const SUnit *SU = Worklist.pop_back_val();
564     for (auto &SI : SU->Succs) {
565       SUnit *SuccSU = SI.getSUnit();
566       if (SI.getKind() == SDep::Order) {
567         if (Visited.count(SuccSU))
568           continue;
569         if (SuccSU == SUb)
570           return true;
571         Worklist.push_back(SuccSU);
572         Visited.insert(SuccSU);
573       }
574     }
575   }
576   return false;
577 }
578 
579 /// Return true if the instruction causes a chain between memory
580 /// references before and after it.
581 static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
582   return MI.isCall() || MI.hasUnmodeledSideEffects() ||
583          (MI.hasOrderedMemoryRef() &&
584           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
585 }
586 
587 /// Return the underlying objects for the memory references of an instruction.
588 /// This function calls the code in ValueTracking, but first checks that the
589 /// instruction has a memory operand.
590 static void getUnderlyingObjects(const MachineInstr *MI,
591                                  SmallVectorImpl<const Value *> &Objs,
592                                  const DataLayout &DL) {
593   if (!MI->hasOneMemOperand())
594     return;
595   MachineMemOperand *MM = *MI->memoperands_begin();
596   if (!MM->getValue())
597     return;
598   GetUnderlyingObjects(MM->getValue(), Objs, DL);
599   for (const Value *V : Objs) {
600     if (!isIdentifiedObject(V)) {
601       Objs.clear();
602       return;
603     }
604     Objs.push_back(V);
605   }
606 }
607 
608 /// Add a chain edge between a load and store if the store can be an
609 /// alias of the load on a subsequent iteration, i.e., a loop carried
610 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
611 /// but that code doesn't create loop carried dependences.
612 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
613   MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
614   Value *UnknownValue =
615     UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
616   for (auto &SU : SUnits) {
617     MachineInstr &MI = *SU.getInstr();
618     if (isDependenceBarrier(MI, AA))
619       PendingLoads.clear();
620     else if (MI.mayLoad()) {
621       SmallVector<const Value *, 4> Objs;
622       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
623       if (Objs.empty())
624         Objs.push_back(UnknownValue);
625       for (auto V : Objs) {
626         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
627         SUs.push_back(&SU);
628       }
629     } else if (MI.mayStore()) {
630       SmallVector<const Value *, 4> Objs;
631       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
632       if (Objs.empty())
633         Objs.push_back(UnknownValue);
634       for (auto V : Objs) {
635         MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
636             PendingLoads.find(V);
637         if (I == PendingLoads.end())
638           continue;
639         for (auto Load : I->second) {
640           if (isSuccOrder(Load, &SU))
641             continue;
642           MachineInstr &LdMI = *Load->getInstr();
643           // First, perform the cheaper check that compares the base register.
644           // If they are the same and the load offset is less than the store
645           // offset, then mark the dependence as loop carried potentially.
646           const MachineOperand *BaseOp1, *BaseOp2;
647           int64_t Offset1, Offset2;
648           if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&
649               TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
650             if (BaseOp1->isIdenticalTo(*BaseOp2) &&
651                 (int)Offset1 < (int)Offset2) {
652               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
653                      "What happened to the chain edge?");
654               SDep Dep(Load, SDep::Barrier);
655               Dep.setLatency(1);
656               SU.addPred(Dep);
657               continue;
658             }
659           }
660           // Second, the more expensive check that uses alias analysis on the
661           // base registers. If they alias, and the load offset is less than
662           // the store offset, the mark the dependence as loop carried.
663           if (!AA) {
664             SDep Dep(Load, SDep::Barrier);
665             Dep.setLatency(1);
666             SU.addPred(Dep);
667             continue;
668           }
669           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
670           MachineMemOperand *MMO2 = *MI.memoperands_begin();
671           if (!MMO1->getValue() || !MMO2->getValue()) {
672             SDep Dep(Load, SDep::Barrier);
673             Dep.setLatency(1);
674             SU.addPred(Dep);
675             continue;
676           }
677           if (MMO1->getValue() == MMO2->getValue() &&
678               MMO1->getOffset() <= MMO2->getOffset()) {
679             SDep Dep(Load, SDep::Barrier);
680             Dep.setLatency(1);
681             SU.addPred(Dep);
682             continue;
683           }
684           AliasResult AAResult = AA->alias(
685               MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
686                              MMO1->getAAInfo()),
687               MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
688                              MMO2->getAAInfo()));
689 
690           if (AAResult != NoAlias) {
691             SDep Dep(Load, SDep::Barrier);
692             Dep.setLatency(1);
693             SU.addPred(Dep);
694           }
695         }
696       }
697     }
698   }
699 }
700 
701 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
702 /// processes dependences for PHIs. This function adds true dependences
703 /// from a PHI to a use, and a loop carried dependence from the use to the
704 /// PHI. The loop carried dependence is represented as an anti dependence
705 /// edge. This function also removes chain dependences between unrelated
706 /// PHIs.
707 void SwingSchedulerDAG::updatePhiDependences() {
708   SmallVector<SDep, 4> RemoveDeps;
709   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
710 
711   // Iterate over each DAG node.
712   for (SUnit &I : SUnits) {
713     RemoveDeps.clear();
714     // Set to true if the instruction has an operand defined by a Phi.
715     unsigned HasPhiUse = 0;
716     unsigned HasPhiDef = 0;
717     MachineInstr *MI = I.getInstr();
718     // Iterate over each operand, and we process the definitions.
719     for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
720                                     MOE = MI->operands_end();
721          MOI != MOE; ++MOI) {
722       if (!MOI->isReg())
723         continue;
724       unsigned Reg = MOI->getReg();
725       if (MOI->isDef()) {
726         // If the register is used by a Phi, then create an anti dependence.
727         for (MachineRegisterInfo::use_instr_iterator
728                  UI = MRI.use_instr_begin(Reg),
729                  UE = MRI.use_instr_end();
730              UI != UE; ++UI) {
731           MachineInstr *UseMI = &*UI;
732           SUnit *SU = getSUnit(UseMI);
733           if (SU != nullptr && UseMI->isPHI()) {
734             if (!MI->isPHI()) {
735               SDep Dep(SU, SDep::Anti, Reg);
736               Dep.setLatency(1);
737               I.addPred(Dep);
738             } else {
739               HasPhiDef = Reg;
740               // Add a chain edge to a dependent Phi that isn't an existing
741               // predecessor.
742               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
743                 I.addPred(SDep(SU, SDep::Barrier));
744             }
745           }
746         }
747       } else if (MOI->isUse()) {
748         // If the register is defined by a Phi, then create a true dependence.
749         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
750         if (DefMI == nullptr)
751           continue;
752         SUnit *SU = getSUnit(DefMI);
753         if (SU != nullptr && DefMI->isPHI()) {
754           if (!MI->isPHI()) {
755             SDep Dep(SU, SDep::Data, Reg);
756             Dep.setLatency(0);
757             ST.adjustSchedDependency(SU, &I, Dep);
758             I.addPred(Dep);
759           } else {
760             HasPhiUse = Reg;
761             // Add a chain edge to a dependent Phi that isn't an existing
762             // predecessor.
763             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
764               I.addPred(SDep(SU, SDep::Barrier));
765           }
766         }
767       }
768     }
769     // Remove order dependences from an unrelated Phi.
770     if (!SwpPruneDeps)
771       continue;
772     for (auto &PI : I.Preds) {
773       MachineInstr *PMI = PI.getSUnit()->getInstr();
774       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
775         if (I.getInstr()->isPHI()) {
776           if (PMI->getOperand(0).getReg() == HasPhiUse)
777             continue;
778           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
779             continue;
780         }
781         RemoveDeps.push_back(PI);
782       }
783     }
784     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
785       I.removePred(RemoveDeps[i]);
786   }
787 }
788 
789 /// Iterate over each DAG node and see if we can change any dependences
790 /// in order to reduce the recurrence MII.
791 void SwingSchedulerDAG::changeDependences() {
792   // See if an instruction can use a value from the previous iteration.
793   // If so, we update the base and offset of the instruction and change
794   // the dependences.
795   for (SUnit &I : SUnits) {
796     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
797     int64_t NewOffset = 0;
798     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
799                                NewOffset))
800       continue;
801 
802     // Get the MI and SUnit for the instruction that defines the original base.
803     unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
804     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
805     if (!DefMI)
806       continue;
807     SUnit *DefSU = getSUnit(DefMI);
808     if (!DefSU)
809       continue;
810     // Get the MI and SUnit for the instruction that defins the new base.
811     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
812     if (!LastMI)
813       continue;
814     SUnit *LastSU = getSUnit(LastMI);
815     if (!LastSU)
816       continue;
817 
818     if (Topo.IsReachable(&I, LastSU))
819       continue;
820 
821     // Remove the dependence. The value now depends on a prior iteration.
822     SmallVector<SDep, 4> Deps;
823     for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
824          ++P)
825       if (P->getSUnit() == DefSU)
826         Deps.push_back(*P);
827     for (int i = 0, e = Deps.size(); i != e; i++) {
828       Topo.RemovePred(&I, Deps[i].getSUnit());
829       I.removePred(Deps[i]);
830     }
831     // Remove the chain dependence between the instructions.
832     Deps.clear();
833     for (auto &P : LastSU->Preds)
834       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
835         Deps.push_back(P);
836     for (int i = 0, e = Deps.size(); i != e; i++) {
837       Topo.RemovePred(LastSU, Deps[i].getSUnit());
838       LastSU->removePred(Deps[i]);
839     }
840 
841     // Add a dependence between the new instruction and the instruction
842     // that defines the new base.
843     SDep Dep(&I, SDep::Anti, NewBase);
844     Topo.AddPred(LastSU, &I);
845     LastSU->addPred(Dep);
846 
847     // Remember the base and offset information so that we can update the
848     // instruction during code generation.
849     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
850   }
851 }
852 
853 namespace {
854 
855 // FuncUnitSorter - Comparison operator used to sort instructions by
856 // the number of functional unit choices.
857 struct FuncUnitSorter {
858   const InstrItineraryData *InstrItins;
859   const MCSubtargetInfo *STI;
860   DenseMap<unsigned, unsigned> Resources;
861 
862   FuncUnitSorter(const TargetSubtargetInfo &TSI)
863       : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
864 
865   // Compute the number of functional unit alternatives needed
866   // at each stage, and take the minimum value. We prioritize the
867   // instructions by the least number of choices first.
868   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
869     unsigned SchedClass = Inst->getDesc().getSchedClass();
870     unsigned min = UINT_MAX;
871     if (InstrItins && !InstrItins->isEmpty()) {
872       for (const InstrStage &IS :
873            make_range(InstrItins->beginStage(SchedClass),
874                       InstrItins->endStage(SchedClass))) {
875         unsigned funcUnits = IS.getUnits();
876         unsigned numAlternatives = countPopulation(funcUnits);
877         if (numAlternatives < min) {
878           min = numAlternatives;
879           F = funcUnits;
880         }
881       }
882       return min;
883     }
884     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
885       const MCSchedClassDesc *SCDesc =
886           STI->getSchedModel().getSchedClassDesc(SchedClass);
887       if (!SCDesc->isValid())
888         // No valid Schedule Class Desc for schedClass, should be
889         // Pseudo/PostRAPseudo
890         return min;
891 
892       for (const MCWriteProcResEntry &PRE :
893            make_range(STI->getWriteProcResBegin(SCDesc),
894                       STI->getWriteProcResEnd(SCDesc))) {
895         if (!PRE.Cycles)
896           continue;
897         const MCProcResourceDesc *ProcResource =
898             STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
899         unsigned NumUnits = ProcResource->NumUnits;
900         if (NumUnits < min) {
901           min = NumUnits;
902           F = PRE.ProcResourceIdx;
903         }
904       }
905       return min;
906     }
907     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
908   }
909 
910   // Compute the critical resources needed by the instruction. This
911   // function records the functional units needed by instructions that
912   // must use only one functional unit. We use this as a tie breaker
913   // for computing the resource MII. The instrutions that require
914   // the same, highly used, functional unit have high priority.
915   void calcCriticalResources(MachineInstr &MI) {
916     unsigned SchedClass = MI.getDesc().getSchedClass();
917     if (InstrItins && !InstrItins->isEmpty()) {
918       for (const InstrStage &IS :
919            make_range(InstrItins->beginStage(SchedClass),
920                       InstrItins->endStage(SchedClass))) {
921         unsigned FuncUnits = IS.getUnits();
922         if (countPopulation(FuncUnits) == 1)
923           Resources[FuncUnits]++;
924       }
925       return;
926     }
927     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
928       const MCSchedClassDesc *SCDesc =
929           STI->getSchedModel().getSchedClassDesc(SchedClass);
930       if (!SCDesc->isValid())
931         // No valid Schedule Class Desc for schedClass, should be
932         // Pseudo/PostRAPseudo
933         return;
934 
935       for (const MCWriteProcResEntry &PRE :
936            make_range(STI->getWriteProcResBegin(SCDesc),
937                       STI->getWriteProcResEnd(SCDesc))) {
938         if (!PRE.Cycles)
939           continue;
940         Resources[PRE.ProcResourceIdx]++;
941       }
942       return;
943     }
944     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
945   }
946 
947   /// Return true if IS1 has less priority than IS2.
948   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
949     unsigned F1 = 0, F2 = 0;
950     unsigned MFUs1 = minFuncUnits(IS1, F1);
951     unsigned MFUs2 = minFuncUnits(IS2, F2);
952     if (MFUs1 == 1 && MFUs2 == 1)
953       return Resources.lookup(F1) < Resources.lookup(F2);
954     return MFUs1 > MFUs2;
955   }
956 };
957 
958 } // end anonymous namespace
959 
960 /// Calculate the resource constrained minimum initiation interval for the
961 /// specified loop. We use the DFA to model the resources needed for
962 /// each instruction, and we ignore dependences. A different DFA is created
963 /// for each cycle that is required. When adding a new instruction, we attempt
964 /// to add it to each existing DFA, until a legal space is found. If the
965 /// instruction cannot be reserved in an existing DFA, we create a new one.
966 unsigned SwingSchedulerDAG::calculateResMII() {
967 
968   LLVM_DEBUG(dbgs() << "calculateResMII:\n");
969   SmallVector<ResourceManager*, 8> Resources;
970   MachineBasicBlock *MBB = Loop.getHeader();
971   Resources.push_back(new ResourceManager(&MF.getSubtarget()));
972 
973   // Sort the instructions by the number of available choices for scheduling,
974   // least to most. Use the number of critical resources as the tie breaker.
975   FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget());
976   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
977                                    E = MBB->getFirstTerminator();
978        I != E; ++I)
979     FUS.calcCriticalResources(*I);
980   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
981       FuncUnitOrder(FUS);
982 
983   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
984                                    E = MBB->getFirstTerminator();
985        I != E; ++I)
986     FuncUnitOrder.push(&*I);
987 
988   while (!FuncUnitOrder.empty()) {
989     MachineInstr *MI = FuncUnitOrder.top();
990     FuncUnitOrder.pop();
991     if (TII->isZeroCost(MI->getOpcode()))
992       continue;
993     // Attempt to reserve the instruction in an existing DFA. At least one
994     // DFA is needed for each cycle.
995     unsigned NumCycles = getSUnit(MI)->Latency;
996     unsigned ReservedCycles = 0;
997     SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin();
998     SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end();
999     LLVM_DEBUG({
1000       dbgs() << "Trying to reserve resource for " << NumCycles
1001              << " cycles for \n";
1002       MI->dump();
1003     });
1004     for (unsigned C = 0; C < NumCycles; ++C)
1005       while (RI != RE) {
1006         if ((*RI++)->canReserveResources(*MI)) {
1007           ++ReservedCycles;
1008           break;
1009         }
1010       }
1011     // Start reserving resources using existing DFAs.
1012     for (unsigned C = 0; C < ReservedCycles; ++C) {
1013       --RI;
1014       (*RI)->reserveResources(*MI);
1015     }
1016 
1017     LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
1018                       << ", NumCycles:" << NumCycles << "\n");
1019     // Add new DFAs, if needed, to reserve resources.
1020     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1021       LLVM_DEBUG(dbgs() << "NewResource created to reserve resources"
1022                         << "\n");
1023       ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget());
1024       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1025       NewResource->reserveResources(*MI);
1026       Resources.push_back(NewResource);
1027     }
1028   }
1029   int Resmii = Resources.size();
1030   LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n");
1031   // Delete the memory for each of the DFAs that were created earlier.
1032   for (ResourceManager *RI : Resources) {
1033     ResourceManager *D = RI;
1034     delete D;
1035   }
1036   Resources.clear();
1037   return Resmii;
1038 }
1039 
1040 /// Calculate the recurrence-constrainted minimum initiation interval.
1041 /// Iterate over each circuit.  Compute the delay(c) and distance(c)
1042 /// for each circuit. The II needs to satisfy the inequality
1043 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1044 /// II that satisfies the inequality, and the RecMII is the maximum
1045 /// of those values.
1046 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1047   unsigned RecMII = 0;
1048 
1049   for (NodeSet &Nodes : NodeSets) {
1050     if (Nodes.empty())
1051       continue;
1052 
1053     unsigned Delay = Nodes.getLatency();
1054     unsigned Distance = 1;
1055 
1056     // ii = ceil(delay / distance)
1057     unsigned CurMII = (Delay + Distance - 1) / Distance;
1058     Nodes.setRecMII(CurMII);
1059     if (CurMII > RecMII)
1060       RecMII = CurMII;
1061   }
1062 
1063   return RecMII;
1064 }
1065 
1066 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1067 /// but we do this to find the circuits, and then change them back.
1068 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1069   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1070   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1071     SUnit *SU = &SUnits[i];
1072     for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1073          IP != EP; ++IP) {
1074       if (IP->getKind() != SDep::Anti)
1075         continue;
1076       DepsAdded.push_back(std::make_pair(SU, *IP));
1077     }
1078   }
1079   for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1080                                                           E = DepsAdded.end();
1081        I != E; ++I) {
1082     // Remove this anti dependency and add one in the reverse direction.
1083     SUnit *SU = I->first;
1084     SDep &D = I->second;
1085     SUnit *TargetSU = D.getSUnit();
1086     unsigned Reg = D.getReg();
1087     unsigned Lat = D.getLatency();
1088     SU->removePred(D);
1089     SDep Dep(SU, SDep::Anti, Reg);
1090     Dep.setLatency(Lat);
1091     TargetSU->addPred(Dep);
1092   }
1093 }
1094 
1095 /// Create the adjacency structure of the nodes in the graph.
1096 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1097     SwingSchedulerDAG *DAG) {
1098   BitVector Added(SUnits.size());
1099   DenseMap<int, int> OutputDeps;
1100   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1101     Added.reset();
1102     // Add any successor to the adjacency matrix and exclude duplicates.
1103     for (auto &SI : SUnits[i].Succs) {
1104       // Only create a back-edge on the first and last nodes of a dependence
1105       // chain. This records any chains and adds them later.
1106       if (SI.getKind() == SDep::Output) {
1107         int N = SI.getSUnit()->NodeNum;
1108         int BackEdge = i;
1109         auto Dep = OutputDeps.find(BackEdge);
1110         if (Dep != OutputDeps.end()) {
1111           BackEdge = Dep->second;
1112           OutputDeps.erase(Dep);
1113         }
1114         OutputDeps[N] = BackEdge;
1115       }
1116       // Do not process a boundary node, an artificial node.
1117       // A back-edge is processed only if it goes to a Phi.
1118       if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1119           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1120         continue;
1121       int N = SI.getSUnit()->NodeNum;
1122       if (!Added.test(N)) {
1123         AdjK[i].push_back(N);
1124         Added.set(N);
1125       }
1126     }
1127     // A chain edge between a store and a load is treated as a back-edge in the
1128     // adjacency matrix.
1129     for (auto &PI : SUnits[i].Preds) {
1130       if (!SUnits[i].getInstr()->mayStore() ||
1131           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1132         continue;
1133       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1134         int N = PI.getSUnit()->NodeNum;
1135         if (!Added.test(N)) {
1136           AdjK[i].push_back(N);
1137           Added.set(N);
1138         }
1139       }
1140     }
1141   }
1142   // Add back-edges in the adjacency matrix for the output dependences.
1143   for (auto &OD : OutputDeps)
1144     if (!Added.test(OD.second)) {
1145       AdjK[OD.first].push_back(OD.second);
1146       Added.set(OD.second);
1147     }
1148 }
1149 
1150 /// Identify an elementary circuit in the dependence graph starting at the
1151 /// specified node.
1152 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1153                                           bool HasBackedge) {
1154   SUnit *SV = &SUnits[V];
1155   bool F = false;
1156   Stack.insert(SV);
1157   Blocked.set(V);
1158 
1159   for (auto W : AdjK[V]) {
1160     if (NumPaths > MaxPaths)
1161       break;
1162     if (W < S)
1163       continue;
1164     if (W == S) {
1165       if (!HasBackedge)
1166         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1167       F = true;
1168       ++NumPaths;
1169       break;
1170     } else if (!Blocked.test(W)) {
1171       if (circuit(W, S, NodeSets,
1172                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1173         F = true;
1174     }
1175   }
1176 
1177   if (F)
1178     unblock(V);
1179   else {
1180     for (auto W : AdjK[V]) {
1181       if (W < S)
1182         continue;
1183       if (B[W].count(SV) == 0)
1184         B[W].insert(SV);
1185     }
1186   }
1187   Stack.pop_back();
1188   return F;
1189 }
1190 
1191 /// Unblock a node in the circuit finding algorithm.
1192 void SwingSchedulerDAG::Circuits::unblock(int U) {
1193   Blocked.reset(U);
1194   SmallPtrSet<SUnit *, 4> &BU = B[U];
1195   while (!BU.empty()) {
1196     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1197     assert(SI != BU.end() && "Invalid B set.");
1198     SUnit *W = *SI;
1199     BU.erase(W);
1200     if (Blocked.test(W->NodeNum))
1201       unblock(W->NodeNum);
1202   }
1203 }
1204 
1205 /// Identify all the elementary circuits in the dependence graph using
1206 /// Johnson's circuit algorithm.
1207 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1208   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1209   // but we do this to find the circuits, and then change them back.
1210   swapAntiDependences(SUnits);
1211 
1212   Circuits Cir(SUnits, Topo);
1213   // Create the adjacency structure.
1214   Cir.createAdjacencyStructure(this);
1215   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1216     Cir.reset();
1217     Cir.circuit(i, i, NodeSets);
1218   }
1219 
1220   // Change the dependences back so that we've created a DAG again.
1221   swapAntiDependences(SUnits);
1222 }
1223 
1224 // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1225 // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1226 // additional copies that are needed across iterations. An artificial dependence
1227 // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1228 
1229 // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1230 // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1231 // PHI-------True-Dep------> USEOfPhi
1232 
1233 // The mutation creates
1234 // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1235 
1236 // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1237 // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1238 // late  to avoid additional copies across iterations. The possible scheduling
1239 // order would be
1240 // USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
1241 
1242 void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1243   for (SUnit &SU : DAG->SUnits) {
1244     // Find the COPY/REG_SEQUENCE instruction.
1245     if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1246       continue;
1247 
1248     // Record the loop carried PHIs.
1249     SmallVector<SUnit *, 4> PHISUs;
1250     // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1251     SmallVector<SUnit *, 4> SrcSUs;
1252 
1253     for (auto &Dep : SU.Preds) {
1254       SUnit *TmpSU = Dep.getSUnit();
1255       MachineInstr *TmpMI = TmpSU->getInstr();
1256       SDep::Kind DepKind = Dep.getKind();
1257       // Save the loop carried PHI.
1258       if (DepKind == SDep::Anti && TmpMI->isPHI())
1259         PHISUs.push_back(TmpSU);
1260       // Save the source of COPY/REG_SEQUENCE.
1261       // If the source has no pre-decessors, we will end up creating cycles.
1262       else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1263         SrcSUs.push_back(TmpSU);
1264     }
1265 
1266     if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1267       continue;
1268 
1269     // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1270     // SUnit to the container.
1271     SmallVector<SUnit *, 8> UseSUs;
1272     for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
1273       for (auto &Dep : (*I)->Succs) {
1274         if (Dep.getKind() != SDep::Data)
1275           continue;
1276 
1277         SUnit *TmpSU = Dep.getSUnit();
1278         MachineInstr *TmpMI = TmpSU->getInstr();
1279         if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1280           PHISUs.push_back(TmpSU);
1281           continue;
1282         }
1283         UseSUs.push_back(TmpSU);
1284       }
1285     }
1286 
1287     if (UseSUs.size() == 0)
1288       continue;
1289 
1290     SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1291     // Add the artificial dependencies if it does not form a cycle.
1292     for (auto I : UseSUs) {
1293       for (auto Src : SrcSUs) {
1294         if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1295           Src->addPred(SDep(I, SDep::Artificial));
1296           SDAG->Topo.AddPred(Src, I);
1297         }
1298       }
1299     }
1300   }
1301 }
1302 
1303 /// Return true for DAG nodes that we ignore when computing the cost functions.
1304 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1305 /// in the calculation of the ASAP, ALAP, etc functions.
1306 static bool ignoreDependence(const SDep &D, bool isPred) {
1307   if (D.isArtificial())
1308     return true;
1309   return D.getKind() == SDep::Anti && isPred;
1310 }
1311 
1312 /// Compute several functions need to order the nodes for scheduling.
1313 ///  ASAP - Earliest time to schedule a node.
1314 ///  ALAP - Latest time to schedule a node.
1315 ///  MOV - Mobility function, difference between ALAP and ASAP.
1316 ///  D - Depth of each node.
1317 ///  H - Height of each node.
1318 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1319   ScheduleInfo.resize(SUnits.size());
1320 
1321   LLVM_DEBUG({
1322     for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1323                                                     E = Topo.end();
1324          I != E; ++I) {
1325       const SUnit &SU = SUnits[*I];
1326       dumpNode(SU);
1327     }
1328   });
1329 
1330   int maxASAP = 0;
1331   // Compute ASAP and ZeroLatencyDepth.
1332   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1333                                                   E = Topo.end();
1334        I != E; ++I) {
1335     int asap = 0;
1336     int zeroLatencyDepth = 0;
1337     SUnit *SU = &SUnits[*I];
1338     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1339                                     EP = SU->Preds.end();
1340          IP != EP; ++IP) {
1341       SUnit *pred = IP->getSUnit();
1342       if (IP->getLatency() == 0)
1343         zeroLatencyDepth =
1344             std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1345       if (ignoreDependence(*IP, true))
1346         continue;
1347       asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
1348                                   getDistance(pred, SU, *IP) * MII));
1349     }
1350     maxASAP = std::max(maxASAP, asap);
1351     ScheduleInfo[*I].ASAP = asap;
1352     ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
1353   }
1354 
1355   // Compute ALAP, ZeroLatencyHeight, and MOV.
1356   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1357                                                           E = Topo.rend();
1358        I != E; ++I) {
1359     int alap = maxASAP;
1360     int zeroLatencyHeight = 0;
1361     SUnit *SU = &SUnits[*I];
1362     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1363                                     ES = SU->Succs.end();
1364          IS != ES; ++IS) {
1365       SUnit *succ = IS->getSUnit();
1366       if (IS->getLatency() == 0)
1367         zeroLatencyHeight =
1368             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1369       if (ignoreDependence(*IS, true))
1370         continue;
1371       alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
1372                                   getDistance(SU, succ, *IS) * MII));
1373     }
1374 
1375     ScheduleInfo[*I].ALAP = alap;
1376     ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
1377   }
1378 
1379   // After computing the node functions, compute the summary for each node set.
1380   for (NodeSet &I : NodeSets)
1381     I.computeNodeSetInfo(this);
1382 
1383   LLVM_DEBUG({
1384     for (unsigned i = 0; i < SUnits.size(); i++) {
1385       dbgs() << "\tNode " << i << ":\n";
1386       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1387       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1388       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1389       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1390       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1391       dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1392       dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1393     }
1394   });
1395 }
1396 
1397 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1398 /// as the predecessors of the elements of NodeOrder that are not also in
1399 /// NodeOrder.
1400 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1401                    SmallSetVector<SUnit *, 8> &Preds,
1402                    const NodeSet *S = nullptr) {
1403   Preds.clear();
1404   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1405        I != E; ++I) {
1406     for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1407          PI != PE; ++PI) {
1408       if (S && S->count(PI->getSUnit()) == 0)
1409         continue;
1410       if (ignoreDependence(*PI, true))
1411         continue;
1412       if (NodeOrder.count(PI->getSUnit()) == 0)
1413         Preds.insert(PI->getSUnit());
1414     }
1415     // Back-edges are predecessors with an anti-dependence.
1416     for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1417                                     ES = (*I)->Succs.end();
1418          IS != ES; ++IS) {
1419       if (IS->getKind() != SDep::Anti)
1420         continue;
1421       if (S && S->count(IS->getSUnit()) == 0)
1422         continue;
1423       if (NodeOrder.count(IS->getSUnit()) == 0)
1424         Preds.insert(IS->getSUnit());
1425     }
1426   }
1427   return !Preds.empty();
1428 }
1429 
1430 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1431 /// as the successors of the elements of NodeOrder that are not also in
1432 /// NodeOrder.
1433 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1434                    SmallSetVector<SUnit *, 8> &Succs,
1435                    const NodeSet *S = nullptr) {
1436   Succs.clear();
1437   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1438        I != E; ++I) {
1439     for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1440          SI != SE; ++SI) {
1441       if (S && S->count(SI->getSUnit()) == 0)
1442         continue;
1443       if (ignoreDependence(*SI, false))
1444         continue;
1445       if (NodeOrder.count(SI->getSUnit()) == 0)
1446         Succs.insert(SI->getSUnit());
1447     }
1448     for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1449                                     PE = (*I)->Preds.end();
1450          PI != PE; ++PI) {
1451       if (PI->getKind() != SDep::Anti)
1452         continue;
1453       if (S && S->count(PI->getSUnit()) == 0)
1454         continue;
1455       if (NodeOrder.count(PI->getSUnit()) == 0)
1456         Succs.insert(PI->getSUnit());
1457     }
1458   }
1459   return !Succs.empty();
1460 }
1461 
1462 /// Return true if there is a path from the specified node to any of the nodes
1463 /// in DestNodes. Keep track and return the nodes in any path.
1464 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1465                         SetVector<SUnit *> &DestNodes,
1466                         SetVector<SUnit *> &Exclude,
1467                         SmallPtrSet<SUnit *, 8> &Visited) {
1468   if (Cur->isBoundaryNode())
1469     return false;
1470   if (Exclude.count(Cur) != 0)
1471     return false;
1472   if (DestNodes.count(Cur) != 0)
1473     return true;
1474   if (!Visited.insert(Cur).second)
1475     return Path.count(Cur) != 0;
1476   bool FoundPath = false;
1477   for (auto &SI : Cur->Succs)
1478     FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1479   for (auto &PI : Cur->Preds)
1480     if (PI.getKind() == SDep::Anti)
1481       FoundPath |=
1482           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1483   if (FoundPath)
1484     Path.insert(Cur);
1485   return FoundPath;
1486 }
1487 
1488 /// Return true if Set1 is a subset of Set2.
1489 template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1490   for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1491     if (Set2.count(*I) == 0)
1492       return false;
1493   return true;
1494 }
1495 
1496 /// Compute the live-out registers for the instructions in a node-set.
1497 /// The live-out registers are those that are defined in the node-set,
1498 /// but not used. Except for use operands of Phis.
1499 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1500                             NodeSet &NS) {
1501   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1502   MachineRegisterInfo &MRI = MF.getRegInfo();
1503   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1504   SmallSet<unsigned, 4> Uses;
1505   for (SUnit *SU : NS) {
1506     const MachineInstr *MI = SU->getInstr();
1507     if (MI->isPHI())
1508       continue;
1509     for (const MachineOperand &MO : MI->operands())
1510       if (MO.isReg() && MO.isUse()) {
1511         unsigned Reg = MO.getReg();
1512         if (TargetRegisterInfo::isVirtualRegister(Reg))
1513           Uses.insert(Reg);
1514         else if (MRI.isAllocatable(Reg))
1515           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1516             Uses.insert(*Units);
1517       }
1518   }
1519   for (SUnit *SU : NS)
1520     for (const MachineOperand &MO : SU->getInstr()->operands())
1521       if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1522         unsigned Reg = MO.getReg();
1523         if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1524           if (!Uses.count(Reg))
1525             LiveOutRegs.push_back(RegisterMaskPair(Reg,
1526                                                    LaneBitmask::getNone()));
1527         } else if (MRI.isAllocatable(Reg)) {
1528           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1529             if (!Uses.count(*Units))
1530               LiveOutRegs.push_back(RegisterMaskPair(*Units,
1531                                                      LaneBitmask::getNone()));
1532         }
1533       }
1534   RPTracker.addLiveRegs(LiveOutRegs);
1535 }
1536 
1537 /// A heuristic to filter nodes in recurrent node-sets if the register
1538 /// pressure of a set is too high.
1539 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1540   for (auto &NS : NodeSets) {
1541     // Skip small node-sets since they won't cause register pressure problems.
1542     if (NS.size() <= 2)
1543       continue;
1544     IntervalPressure RecRegPressure;
1545     RegPressureTracker RecRPTracker(RecRegPressure);
1546     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1547     computeLiveOuts(MF, RecRPTracker, NS);
1548     RecRPTracker.closeBottom();
1549 
1550     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1551     llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1552       return A->NodeNum > B->NodeNum;
1553     });
1554 
1555     for (auto &SU : SUnits) {
1556       // Since we're computing the register pressure for a subset of the
1557       // instructions in a block, we need to set the tracker for each
1558       // instruction in the node-set. The tracker is set to the instruction
1559       // just after the one we're interested in.
1560       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1561       RecRPTracker.setPos(std::next(CurInstI));
1562 
1563       RegPressureDelta RPDelta;
1564       ArrayRef<PressureChange> CriticalPSets;
1565       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1566                                              CriticalPSets,
1567                                              RecRegPressure.MaxSetPressure);
1568       if (RPDelta.Excess.isValid()) {
1569         LLVM_DEBUG(
1570             dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1571                    << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1572                    << ":" << RPDelta.Excess.getUnitInc());
1573         NS.setExceedPressure(SU);
1574         break;
1575       }
1576       RecRPTracker.recede();
1577     }
1578   }
1579 }
1580 
1581 /// A heuristic to colocate node sets that have the same set of
1582 /// successors.
1583 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1584   unsigned Colocate = 0;
1585   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1586     NodeSet &N1 = NodeSets[i];
1587     SmallSetVector<SUnit *, 8> S1;
1588     if (N1.empty() || !succ_L(N1, S1))
1589       continue;
1590     for (int j = i + 1; j < e; ++j) {
1591       NodeSet &N2 = NodeSets[j];
1592       if (N1.compareRecMII(N2) != 0)
1593         continue;
1594       SmallSetVector<SUnit *, 8> S2;
1595       if (N2.empty() || !succ_L(N2, S2))
1596         continue;
1597       if (isSubset(S1, S2) && S1.size() == S2.size()) {
1598         N1.setColocate(++Colocate);
1599         N2.setColocate(Colocate);
1600         break;
1601       }
1602     }
1603   }
1604 }
1605 
1606 /// Check if the existing node-sets are profitable. If not, then ignore the
1607 /// recurrent node-sets, and attempt to schedule all nodes together. This is
1608 /// a heuristic. If the MII is large and all the recurrent node-sets are small,
1609 /// then it's best to try to schedule all instructions together instead of
1610 /// starting with the recurrent node-sets.
1611 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1612   // Look for loops with a large MII.
1613   if (MII < 17)
1614     return;
1615   // Check if the node-set contains only a simple add recurrence.
1616   for (auto &NS : NodeSets) {
1617     if (NS.getRecMII() > 2)
1618       return;
1619     if (NS.getMaxDepth() > MII)
1620       return;
1621   }
1622   NodeSets.clear();
1623   LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1624   return;
1625 }
1626 
1627 /// Add the nodes that do not belong to a recurrence set into groups
1628 /// based upon connected componenets.
1629 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1630   SetVector<SUnit *> NodesAdded;
1631   SmallPtrSet<SUnit *, 8> Visited;
1632   // Add the nodes that are on a path between the previous node sets and
1633   // the current node set.
1634   for (NodeSet &I : NodeSets) {
1635     SmallSetVector<SUnit *, 8> N;
1636     // Add the nodes from the current node set to the previous node set.
1637     if (succ_L(I, N)) {
1638       SetVector<SUnit *> Path;
1639       for (SUnit *NI : N) {
1640         Visited.clear();
1641         computePath(NI, Path, NodesAdded, I, Visited);
1642       }
1643       if (!Path.empty())
1644         I.insert(Path.begin(), Path.end());
1645     }
1646     // Add the nodes from the previous node set to the current node set.
1647     N.clear();
1648     if (succ_L(NodesAdded, N)) {
1649       SetVector<SUnit *> Path;
1650       for (SUnit *NI : N) {
1651         Visited.clear();
1652         computePath(NI, Path, I, NodesAdded, Visited);
1653       }
1654       if (!Path.empty())
1655         I.insert(Path.begin(), Path.end());
1656     }
1657     NodesAdded.insert(I.begin(), I.end());
1658   }
1659 
1660   // Create a new node set with the connected nodes of any successor of a node
1661   // in a recurrent set.
1662   NodeSet NewSet;
1663   SmallSetVector<SUnit *, 8> N;
1664   if (succ_L(NodesAdded, N))
1665     for (SUnit *I : N)
1666       addConnectedNodes(I, NewSet, NodesAdded);
1667   if (!NewSet.empty())
1668     NodeSets.push_back(NewSet);
1669 
1670   // Create a new node set with the connected nodes of any predecessor of a node
1671   // in a recurrent set.
1672   NewSet.clear();
1673   if (pred_L(NodesAdded, N))
1674     for (SUnit *I : N)
1675       addConnectedNodes(I, NewSet, NodesAdded);
1676   if (!NewSet.empty())
1677     NodeSets.push_back(NewSet);
1678 
1679   // Create new nodes sets with the connected nodes any remaining node that
1680   // has no predecessor.
1681   for (unsigned i = 0; i < SUnits.size(); ++i) {
1682     SUnit *SU = &SUnits[i];
1683     if (NodesAdded.count(SU) == 0) {
1684       NewSet.clear();
1685       addConnectedNodes(SU, NewSet, NodesAdded);
1686       if (!NewSet.empty())
1687         NodeSets.push_back(NewSet);
1688     }
1689   }
1690 }
1691 
1692 /// Add the node to the set, and add all of its connected nodes to the set.
1693 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1694                                           SetVector<SUnit *> &NodesAdded) {
1695   NewSet.insert(SU);
1696   NodesAdded.insert(SU);
1697   for (auto &SI : SU->Succs) {
1698     SUnit *Successor = SI.getSUnit();
1699     if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1700       addConnectedNodes(Successor, NewSet, NodesAdded);
1701   }
1702   for (auto &PI : SU->Preds) {
1703     SUnit *Predecessor = PI.getSUnit();
1704     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1705       addConnectedNodes(Predecessor, NewSet, NodesAdded);
1706   }
1707 }
1708 
1709 /// Return true if Set1 contains elements in Set2. The elements in common
1710 /// are returned in a different container.
1711 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1712                         SmallSetVector<SUnit *, 8> &Result) {
1713   Result.clear();
1714   for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1715     SUnit *SU = Set1[i];
1716     if (Set2.count(SU) != 0)
1717       Result.insert(SU);
1718   }
1719   return !Result.empty();
1720 }
1721 
1722 /// Merge the recurrence node sets that have the same initial node.
1723 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1724   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1725        ++I) {
1726     NodeSet &NI = *I;
1727     for (NodeSetType::iterator J = I + 1; J != E;) {
1728       NodeSet &NJ = *J;
1729       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1730         if (NJ.compareRecMII(NI) > 0)
1731           NI.setRecMII(NJ.getRecMII());
1732         for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1733              ++NII)
1734           I->insert(*NII);
1735         NodeSets.erase(J);
1736         E = NodeSets.end();
1737       } else {
1738         ++J;
1739       }
1740     }
1741   }
1742 }
1743 
1744 /// Remove nodes that have been scheduled in previous NodeSets.
1745 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1746   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1747        ++I)
1748     for (NodeSetType::iterator J = I + 1; J != E;) {
1749       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1750 
1751       if (J->empty()) {
1752         NodeSets.erase(J);
1753         E = NodeSets.end();
1754       } else {
1755         ++J;
1756       }
1757     }
1758 }
1759 
1760 /// Compute an ordered list of the dependence graph nodes, which
1761 /// indicates the order that the nodes will be scheduled.  This is a
1762 /// two-level algorithm. First, a partial order is created, which
1763 /// consists of a list of sets ordered from highest to lowest priority.
1764 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1765   SmallSetVector<SUnit *, 8> R;
1766   NodeOrder.clear();
1767 
1768   for (auto &Nodes : NodeSets) {
1769     LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1770     OrderKind Order;
1771     SmallSetVector<SUnit *, 8> N;
1772     if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
1773       R.insert(N.begin(), N.end());
1774       Order = BottomUp;
1775       LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
1776     } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
1777       R.insert(N.begin(), N.end());
1778       Order = TopDown;
1779       LLVM_DEBUG(dbgs() << "  Top down (succs) ");
1780     } else if (isIntersect(N, Nodes, R)) {
1781       // If some of the successors are in the existing node-set, then use the
1782       // top-down ordering.
1783       Order = TopDown;
1784       LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
1785     } else if (NodeSets.size() == 1) {
1786       for (auto &N : Nodes)
1787         if (N->Succs.size() == 0)
1788           R.insert(N);
1789       Order = BottomUp;
1790       LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
1791     } else {
1792       // Find the node with the highest ASAP.
1793       SUnit *maxASAP = nullptr;
1794       for (SUnit *SU : Nodes) {
1795         if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
1796             (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
1797           maxASAP = SU;
1798       }
1799       R.insert(maxASAP);
1800       Order = BottomUp;
1801       LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
1802     }
1803 
1804     while (!R.empty()) {
1805       if (Order == TopDown) {
1806         // Choose the node with the maximum height.  If more than one, choose
1807         // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
1808         // choose the node with the lowest MOV.
1809         while (!R.empty()) {
1810           SUnit *maxHeight = nullptr;
1811           for (SUnit *I : R) {
1812             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
1813               maxHeight = I;
1814             else if (getHeight(I) == getHeight(maxHeight) &&
1815                      getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
1816               maxHeight = I;
1817             else if (getHeight(I) == getHeight(maxHeight) &&
1818                      getZeroLatencyHeight(I) ==
1819                          getZeroLatencyHeight(maxHeight) &&
1820                      getMOV(I) < getMOV(maxHeight))
1821               maxHeight = I;
1822           }
1823           NodeOrder.insert(maxHeight);
1824           LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
1825           R.remove(maxHeight);
1826           for (const auto &I : maxHeight->Succs) {
1827             if (Nodes.count(I.getSUnit()) == 0)
1828               continue;
1829             if (NodeOrder.count(I.getSUnit()) != 0)
1830               continue;
1831             if (ignoreDependence(I, false))
1832               continue;
1833             R.insert(I.getSUnit());
1834           }
1835           // Back-edges are predecessors with an anti-dependence.
1836           for (const auto &I : maxHeight->Preds) {
1837             if (I.getKind() != SDep::Anti)
1838               continue;
1839             if (Nodes.count(I.getSUnit()) == 0)
1840               continue;
1841             if (NodeOrder.count(I.getSUnit()) != 0)
1842               continue;
1843             R.insert(I.getSUnit());
1844           }
1845         }
1846         Order = BottomUp;
1847         LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
1848         SmallSetVector<SUnit *, 8> N;
1849         if (pred_L(NodeOrder, N, &Nodes))
1850           R.insert(N.begin(), N.end());
1851       } else {
1852         // Choose the node with the maximum depth.  If more than one, choose
1853         // the node with the maximum ZeroLatencyDepth. If still more than one,
1854         // choose the node with the lowest MOV.
1855         while (!R.empty()) {
1856           SUnit *maxDepth = nullptr;
1857           for (SUnit *I : R) {
1858             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
1859               maxDepth = I;
1860             else if (getDepth(I) == getDepth(maxDepth) &&
1861                      getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
1862               maxDepth = I;
1863             else if (getDepth(I) == getDepth(maxDepth) &&
1864                      getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
1865                      getMOV(I) < getMOV(maxDepth))
1866               maxDepth = I;
1867           }
1868           NodeOrder.insert(maxDepth);
1869           LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
1870           R.remove(maxDepth);
1871           if (Nodes.isExceedSU(maxDepth)) {
1872             Order = TopDown;
1873             R.clear();
1874             R.insert(Nodes.getNode(0));
1875             break;
1876           }
1877           for (const auto &I : maxDepth->Preds) {
1878             if (Nodes.count(I.getSUnit()) == 0)
1879               continue;
1880             if (NodeOrder.count(I.getSUnit()) != 0)
1881               continue;
1882             R.insert(I.getSUnit());
1883           }
1884           // Back-edges are predecessors with an anti-dependence.
1885           for (const auto &I : maxDepth->Succs) {
1886             if (I.getKind() != SDep::Anti)
1887               continue;
1888             if (Nodes.count(I.getSUnit()) == 0)
1889               continue;
1890             if (NodeOrder.count(I.getSUnit()) != 0)
1891               continue;
1892             R.insert(I.getSUnit());
1893           }
1894         }
1895         Order = TopDown;
1896         LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
1897         SmallSetVector<SUnit *, 8> N;
1898         if (succ_L(NodeOrder, N, &Nodes))
1899           R.insert(N.begin(), N.end());
1900       }
1901     }
1902     LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
1903   }
1904 
1905   LLVM_DEBUG({
1906     dbgs() << "Node order: ";
1907     for (SUnit *I : NodeOrder)
1908       dbgs() << " " << I->NodeNum << " ";
1909     dbgs() << "\n";
1910   });
1911 }
1912 
1913 /// Process the nodes in the computed order and create the pipelined schedule
1914 /// of the instructions, if possible. Return true if a schedule is found.
1915 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
1916 
1917   if (NodeOrder.empty()){
1918     LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
1919     return false;
1920   }
1921 
1922   bool scheduleFound = false;
1923   unsigned II = 0;
1924   // Keep increasing II until a valid schedule is found.
1925   for (II = MII; II <= MAX_II && !scheduleFound; ++II) {
1926     Schedule.reset();
1927     Schedule.setInitiationInterval(II);
1928     LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
1929 
1930     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
1931     SetVector<SUnit *>::iterator NE = NodeOrder.end();
1932     do {
1933       SUnit *SU = *NI;
1934 
1935       // Compute the schedule time for the instruction, which is based
1936       // upon the scheduled time for any predecessors/successors.
1937       int EarlyStart = INT_MIN;
1938       int LateStart = INT_MAX;
1939       // These values are set when the size of the schedule window is limited
1940       // due to chain dependences.
1941       int SchedEnd = INT_MAX;
1942       int SchedStart = INT_MIN;
1943       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
1944                             II, this);
1945       LLVM_DEBUG({
1946         dbgs() << "\n";
1947         dbgs() << "Inst (" << SU->NodeNum << ") ";
1948         SU->getInstr()->dump();
1949         dbgs() << "\n";
1950       });
1951       LLVM_DEBUG({
1952         dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
1953                          LateStart, SchedEnd, SchedStart);
1954       });
1955 
1956       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
1957           SchedStart > LateStart)
1958         scheduleFound = false;
1959       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
1960         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
1961         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
1962       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
1963         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
1964         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
1965       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
1966         SchedEnd =
1967             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
1968         // When scheduling a Phi it is better to start at the late cycle and go
1969         // backwards. The default order may insert the Phi too far away from
1970         // its first dependence.
1971         if (SU->getInstr()->isPHI())
1972           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
1973         else
1974           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
1975       } else {
1976         int FirstCycle = Schedule.getFirstCycle();
1977         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
1978                                         FirstCycle + getASAP(SU) + II - 1, II);
1979       }
1980       // Even if we find a schedule, make sure the schedule doesn't exceed the
1981       // allowable number of stages. We keep trying if this happens.
1982       if (scheduleFound)
1983         if (SwpMaxStages > -1 &&
1984             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
1985           scheduleFound = false;
1986 
1987       LLVM_DEBUG({
1988         if (!scheduleFound)
1989           dbgs() << "\tCan't schedule\n";
1990       });
1991     } while (++NI != NE && scheduleFound);
1992 
1993     // If a schedule is found, check if it is a valid schedule too.
1994     if (scheduleFound)
1995       scheduleFound = Schedule.isValidSchedule(this);
1996   }
1997 
1998   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
1999                     << ")\n");
2000 
2001   if (scheduleFound)
2002     Schedule.finalizeSchedule(this);
2003   else
2004     Schedule.reset();
2005 
2006   return scheduleFound && Schedule.getMaxStageCount() > 0;
2007 }
2008 
2009 /// Given a schedule for the loop, generate a new version of the loop,
2010 /// and replace the old version.  This function generates a prolog
2011 /// that contains the initial iterations in the pipeline, and kernel
2012 /// loop, and the epilogue that contains the code for the final
2013 /// iterations.
2014 void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2015   // Create a new basic block for the kernel and add it to the CFG.
2016   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2017 
2018   unsigned MaxStageCount = Schedule.getMaxStageCount();
2019 
2020   // Remember the registers that are used in different stages. The index is
2021   // the iteration, or stage, that the instruction is scheduled in.  This is
2022   // a map between register names in the original block and the names created
2023   // in each stage of the pipelined loop.
2024   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2025   InstrMapTy InstrMap;
2026 
2027   SmallVector<MachineBasicBlock *, 4> PrologBBs;
2028   // Generate the prolog instructions that set up the pipeline.
2029   generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2030   MF.insert(BB->getIterator(), KernelBB);
2031 
2032   // Rearrange the instructions to generate the new, pipelined loop,
2033   // and update register names as needed.
2034   for (int Cycle = Schedule.getFirstCycle(),
2035            LastCycle = Schedule.getFinalCycle();
2036        Cycle <= LastCycle; ++Cycle) {
2037     std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
2038     // This inner loop schedules each instruction in the cycle.
2039     for (SUnit *CI : CycleInstrs) {
2040       if (CI->getInstr()->isPHI())
2041         continue;
2042       unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2043       MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2044       updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2045       KernelBB->push_back(NewMI);
2046       InstrMap[NewMI] = CI->getInstr();
2047     }
2048   }
2049 
2050   // Copy any terminator instructions to the new kernel, and update
2051   // names as needed.
2052   for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2053                                    E = BB->instr_end();
2054        I != E; ++I) {
2055     MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2056     updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2057     KernelBB->push_back(NewMI);
2058     InstrMap[NewMI] = &*I;
2059   }
2060 
2061   KernelBB->transferSuccessors(BB);
2062   KernelBB->replaceSuccessor(BB, KernelBB);
2063 
2064   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2065                        VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2066   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2067                InstrMap, MaxStageCount, MaxStageCount, false);
2068 
2069   LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2070 
2071   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
2072   // Generate the epilog instructions to complete the pipeline.
2073   generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2074                  PrologBBs);
2075 
2076   // We need this step because the register allocation doesn't handle some
2077   // situations well, so we insert copies to help out.
2078   splitLifetimes(KernelBB, EpilogBBs, Schedule);
2079 
2080   // Remove dead instructions due to loop induction variables.
2081   removeDeadInstructions(KernelBB, EpilogBBs);
2082 
2083   // Add branches between prolog and epilog blocks.
2084   addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2085 
2086   // Remove the original loop since it's no longer referenced.
2087   for (auto &I : *BB)
2088     LIS.RemoveMachineInstrFromMaps(I);
2089   BB->clear();
2090   BB->eraseFromParent();
2091 
2092   delete[] VRMap;
2093 }
2094 
2095 /// Generate the pipeline prolog code.
2096 void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2097                                        MachineBasicBlock *KernelBB,
2098                                        ValueMapTy *VRMap,
2099                                        MBBVectorTy &PrologBBs) {
2100   MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2101   assert(PreheaderBB != nullptr &&
2102          "Need to add code to handle loops w/o preheader");
2103   MachineBasicBlock *PredBB = PreheaderBB;
2104   InstrMapTy InstrMap;
2105 
2106   // Generate a basic block for each stage, not including the last stage,
2107   // which will be generated in the kernel. Each basic block may contain
2108   // instructions from multiple stages/iterations.
2109   for (unsigned i = 0; i < LastStage; ++i) {
2110     // Create and insert the prolog basic block prior to the original loop
2111     // basic block.  The original loop is removed later.
2112     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2113     PrologBBs.push_back(NewBB);
2114     MF.insert(BB->getIterator(), NewBB);
2115     NewBB->transferSuccessors(PredBB);
2116     PredBB->addSuccessor(NewBB);
2117     PredBB = NewBB;
2118 
2119     // Generate instructions for each appropriate stage. Process instructions
2120     // in original program order.
2121     for (int StageNum = i; StageNum >= 0; --StageNum) {
2122       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2123                                        BBE = BB->getFirstTerminator();
2124            BBI != BBE; ++BBI) {
2125         if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
2126           if (BBI->isPHI())
2127             continue;
2128           MachineInstr *NewMI =
2129               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2130           updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2131                             VRMap);
2132           NewBB->push_back(NewMI);
2133           InstrMap[NewMI] = &*BBI;
2134         }
2135       }
2136     }
2137     rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2138     LLVM_DEBUG({
2139       dbgs() << "prolog:\n";
2140       NewBB->dump();
2141     });
2142   }
2143 
2144   PredBB->replaceSuccessor(BB, KernelBB);
2145 
2146   // Check if we need to remove the branch from the preheader to the original
2147   // loop, and replace it with a branch to the new loop.
2148   unsigned numBranches = TII->removeBranch(*PreheaderBB);
2149   if (numBranches) {
2150     SmallVector<MachineOperand, 0> Cond;
2151     TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2152   }
2153 }
2154 
2155 /// Generate the pipeline epilog code. The epilog code finishes the iterations
2156 /// that were started in either the prolog or the kernel.  We create a basic
2157 /// block for each stage that needs to complete.
2158 void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2159                                        MachineBasicBlock *KernelBB,
2160                                        ValueMapTy *VRMap,
2161                                        MBBVectorTy &EpilogBBs,
2162                                        MBBVectorTy &PrologBBs) {
2163   // We need to change the branch from the kernel to the first epilog block, so
2164   // this call to analyze branch uses the kernel rather than the original BB.
2165   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2166   SmallVector<MachineOperand, 4> Cond;
2167   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2168   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2169   if (checkBranch)
2170     return;
2171 
2172   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2173   if (*LoopExitI == KernelBB)
2174     ++LoopExitI;
2175   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2176   MachineBasicBlock *LoopExitBB = *LoopExitI;
2177 
2178   MachineBasicBlock *PredBB = KernelBB;
2179   MachineBasicBlock *EpilogStart = LoopExitBB;
2180   InstrMapTy InstrMap;
2181 
2182   // Generate a basic block for each stage, not including the last stage,
2183   // which was generated for the kernel.  Each basic block may contain
2184   // instructions from multiple stages/iterations.
2185   int EpilogStage = LastStage + 1;
2186   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2187     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2188     EpilogBBs.push_back(NewBB);
2189     MF.insert(BB->getIterator(), NewBB);
2190 
2191     PredBB->replaceSuccessor(LoopExitBB, NewBB);
2192     NewBB->addSuccessor(LoopExitBB);
2193 
2194     if (EpilogStart == LoopExitBB)
2195       EpilogStart = NewBB;
2196 
2197     // Add instructions to the epilog depending on the current block.
2198     // Process instructions in original program order.
2199     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2200       for (auto &BBI : *BB) {
2201         if (BBI.isPHI())
2202           continue;
2203         MachineInstr *In = &BBI;
2204         if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2205           // Instructions with memoperands in the epilog are updated with
2206           // conservative values.
2207           MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
2208           updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2209           NewBB->push_back(NewMI);
2210           InstrMap[NewMI] = In;
2211         }
2212       }
2213     }
2214     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2215                          VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2216     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2217                  InstrMap, LastStage, EpilogStage, i == 1);
2218     PredBB = NewBB;
2219 
2220     LLVM_DEBUG({
2221       dbgs() << "epilog:\n";
2222       NewBB->dump();
2223     });
2224   }
2225 
2226   // Fix any Phi nodes in the loop exit block.
2227   for (MachineInstr &MI : *LoopExitBB) {
2228     if (!MI.isPHI())
2229       break;
2230     for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2231       MachineOperand &MO = MI.getOperand(i);
2232       if (MO.getMBB() == BB)
2233         MO.setMBB(PredBB);
2234     }
2235   }
2236 
2237   // Create a branch to the new epilog from the kernel.
2238   // Remove the original branch and add a new branch to the epilog.
2239   TII->removeBranch(*KernelBB);
2240   TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2241   // Add a branch to the loop exit.
2242   if (EpilogBBs.size() > 0) {
2243     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2244     SmallVector<MachineOperand, 4> Cond1;
2245     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2246   }
2247 }
2248 
2249 /// Replace all uses of FromReg that appear outside the specified
2250 /// basic block with ToReg.
2251 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2252                                     MachineBasicBlock *MBB,
2253                                     MachineRegisterInfo &MRI,
2254                                     LiveIntervals &LIS) {
2255   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2256                                          E = MRI.use_end();
2257        I != E;) {
2258     MachineOperand &O = *I;
2259     ++I;
2260     if (O.getParent()->getParent() != MBB)
2261       O.setReg(ToReg);
2262   }
2263   if (!LIS.hasInterval(ToReg))
2264     LIS.createEmptyInterval(ToReg);
2265 }
2266 
2267 /// Return true if the register has a use that occurs outside the
2268 /// specified loop.
2269 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2270                             MachineRegisterInfo &MRI) {
2271   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2272                                          E = MRI.use_end();
2273        I != E; ++I)
2274     if (I->getParent()->getParent() != BB)
2275       return true;
2276   return false;
2277 }
2278 
2279 /// Generate Phis for the specific block in the generated pipelined code.
2280 /// This function looks at the Phis from the original code to guide the
2281 /// creation of new Phis.
2282 void SwingSchedulerDAG::generateExistingPhis(
2283     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2284     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2285     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2286     bool IsLast) {
2287   // Compute the stage number for the initial value of the Phi, which
2288   // comes from the prolog. The prolog to use depends on to which kernel/
2289   // epilog that we're adding the Phi.
2290   unsigned PrologStage = 0;
2291   unsigned PrevStage = 0;
2292   bool InKernel = (LastStageNum == CurStageNum);
2293   if (InKernel) {
2294     PrologStage = LastStageNum - 1;
2295     PrevStage = CurStageNum;
2296   } else {
2297     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2298     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2299   }
2300 
2301   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2302                                    BBE = BB->getFirstNonPHI();
2303        BBI != BBE; ++BBI) {
2304     unsigned Def = BBI->getOperand(0).getReg();
2305 
2306     unsigned InitVal = 0;
2307     unsigned LoopVal = 0;
2308     getPhiRegs(*BBI, BB, InitVal, LoopVal);
2309 
2310     unsigned PhiOp1 = 0;
2311     // The Phi value from the loop body typically is defined in the loop, but
2312     // not always. So, we need to check if the value is defined in the loop.
2313     unsigned PhiOp2 = LoopVal;
2314     if (VRMap[LastStageNum].count(LoopVal))
2315       PhiOp2 = VRMap[LastStageNum][LoopVal];
2316 
2317     int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2318     int LoopValStage =
2319         Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2320     unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2321     if (NumStages == 0) {
2322       // We don't need to generate a Phi anymore, but we need to rename any uses
2323       // of the Phi value.
2324       unsigned NewReg = VRMap[PrevStage][LoopVal];
2325       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2326                             Def, InitVal, NewReg);
2327       if (VRMap[CurStageNum].count(LoopVal))
2328         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2329     }
2330     // Adjust the number of Phis needed depending on the number of prologs left,
2331     // and the distance from where the Phi is first scheduled. The number of
2332     // Phis cannot exceed the number of prolog stages. Each stage can
2333     // potentially define two values.
2334     unsigned MaxPhis = PrologStage + 2;
2335     if (!InKernel && (int)PrologStage <= LoopValStage)
2336       MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
2337     unsigned NumPhis = std::min(NumStages, MaxPhis);
2338 
2339     unsigned NewReg = 0;
2340     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2341     // In the epilog, we may need to look back one stage to get the correct
2342     // Phi name because the epilog and prolog blocks execute the same stage.
2343     // The correct name is from the previous block only when the Phi has
2344     // been completely scheduled prior to the epilog, and Phi value is not
2345     // needed in multiple stages.
2346     int StageDiff = 0;
2347     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2348         NumPhis == 1)
2349       StageDiff = 1;
2350     // Adjust the computations below when the phi and the loop definition
2351     // are scheduled in different stages.
2352     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2353       StageDiff = StageScheduled - LoopValStage;
2354     for (unsigned np = 0; np < NumPhis; ++np) {
2355       // If the Phi hasn't been scheduled, then use the initial Phi operand
2356       // value. Otherwise, use the scheduled version of the instruction. This
2357       // is a little complicated when a Phi references another Phi.
2358       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2359         PhiOp1 = InitVal;
2360       // Check if the Phi has already been scheduled in a prolog stage.
2361       else if (PrologStage >= AccessStage + StageDiff + np &&
2362                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2363         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2364       // Check if the Phi has already been scheduled, but the loop instruction
2365       // is either another Phi, or doesn't occur in the loop.
2366       else if (PrologStage >= AccessStage + StageDiff + np) {
2367         // If the Phi references another Phi, we need to examine the other
2368         // Phi to get the correct value.
2369         PhiOp1 = LoopVal;
2370         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2371         int Indirects = 1;
2372         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2373           int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2374           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2375             PhiOp1 = getInitPhiReg(*InstOp1, BB);
2376           else
2377             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2378           InstOp1 = MRI.getVRegDef(PhiOp1);
2379           int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2380           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2381           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2382               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2383             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2384             break;
2385           }
2386           ++Indirects;
2387         }
2388       } else
2389         PhiOp1 = InitVal;
2390       // If this references a generated Phi in the kernel, get the Phi operand
2391       // from the incoming block.
2392       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2393         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2394           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2395 
2396       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2397       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2398       // In the epilog, a map lookup is needed to get the value from the kernel,
2399       // or previous epilog block. How is does this depends on if the
2400       // instruction is scheduled in the previous block.
2401       if (!InKernel) {
2402         int StageDiffAdj = 0;
2403         if (LoopValStage != -1 && StageScheduled > LoopValStage)
2404           StageDiffAdj = StageScheduled - LoopValStage;
2405         // Use the loop value defined in the kernel, unless the kernel
2406         // contains the last definition of the Phi.
2407         if (np == 0 && PrevStage == LastStageNum &&
2408             (StageScheduled != 0 || LoopValStage != 0) &&
2409             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2410           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2411         // Use the value defined by the Phi. We add one because we switch
2412         // from looking at the loop value to the Phi definition.
2413         else if (np > 0 && PrevStage == LastStageNum &&
2414                  VRMap[PrevStage - np + 1].count(Def))
2415           PhiOp2 = VRMap[PrevStage - np + 1][Def];
2416         // Use the loop value defined in the kernel.
2417         else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
2418                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2419           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2420         // Use the value defined by the Phi, unless we're generating the first
2421         // epilog and the Phi refers to a Phi in a different stage.
2422         else if (VRMap[PrevStage - np].count(Def) &&
2423                  (!LoopDefIsPhi || PrevStage != LastStageNum))
2424           PhiOp2 = VRMap[PrevStage - np][Def];
2425       }
2426 
2427       // Check if we can reuse an existing Phi. This occurs when a Phi
2428       // references another Phi, and the other Phi is scheduled in an
2429       // earlier stage. We can try to reuse an existing Phi up until the last
2430       // stage of the current Phi.
2431       if (LoopDefIsPhi) {
2432         if (static_cast<int>(PrologStage - np) >= StageScheduled) {
2433           int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2434           int StageDiff = (StageScheduled - LoopValStage);
2435           LVNumStages -= StageDiff;
2436           // Make sure the loop value Phi has been processed already.
2437           if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
2438             NewReg = PhiOp2;
2439             unsigned ReuseStage = CurStageNum;
2440             if (Schedule.isLoopCarried(this, *PhiInst))
2441               ReuseStage -= LVNumStages;
2442             // Check if the Phi to reuse has been generated yet. If not, then
2443             // there is nothing to reuse.
2444             if (VRMap[ReuseStage - np].count(LoopVal)) {
2445               NewReg = VRMap[ReuseStage - np][LoopVal];
2446 
2447               rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2448                                     &*BBI, Def, NewReg);
2449               // Update the map with the new Phi name.
2450               VRMap[CurStageNum - np][Def] = NewReg;
2451               PhiOp2 = NewReg;
2452               if (VRMap[LastStageNum - np - 1].count(LoopVal))
2453                 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2454 
2455               if (IsLast && np == NumPhis - 1)
2456                 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2457               continue;
2458             }
2459           }
2460         }
2461         if (InKernel && StageDiff > 0 &&
2462             VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2463           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2464       }
2465 
2466       const TargetRegisterClass *RC = MRI.getRegClass(Def);
2467       NewReg = MRI.createVirtualRegister(RC);
2468 
2469       MachineInstrBuilder NewPhi =
2470           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2471                   TII->get(TargetOpcode::PHI), NewReg);
2472       NewPhi.addReg(PhiOp1).addMBB(BB1);
2473       NewPhi.addReg(PhiOp2).addMBB(BB2);
2474       if (np == 0)
2475         InstrMap[NewPhi] = &*BBI;
2476 
2477       // We define the Phis after creating the new pipelined code, so
2478       // we need to rename the Phi values in scheduled instructions.
2479 
2480       unsigned PrevReg = 0;
2481       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2482         PrevReg = VRMap[PrevStage - np][LoopVal];
2483       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2484                             Def, NewReg, PrevReg);
2485       // If the Phi has been scheduled, use the new name for rewriting.
2486       if (VRMap[CurStageNum - np].count(Def)) {
2487         unsigned R = VRMap[CurStageNum - np][Def];
2488         rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2489                               R, NewReg);
2490       }
2491 
2492       // Check if we need to rename any uses that occurs after the loop. The
2493       // register to replace depends on whether the Phi is scheduled in the
2494       // epilog.
2495       if (IsLast && np == NumPhis - 1)
2496         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2497 
2498       // In the kernel, a dependent Phi uses the value from this Phi.
2499       if (InKernel)
2500         PhiOp2 = NewReg;
2501 
2502       // Update the map with the new Phi name.
2503       VRMap[CurStageNum - np][Def] = NewReg;
2504     }
2505 
2506     while (NumPhis++ < NumStages) {
2507       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2508                             &*BBI, Def, NewReg, 0);
2509     }
2510 
2511     // Check if we need to rename a Phi that has been eliminated due to
2512     // scheduling.
2513     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2514       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2515   }
2516 }
2517 
2518 /// Generate Phis for the specified block in the generated pipelined code.
2519 /// These are new Phis needed because the definition is scheduled after the
2520 /// use in the pipelined sequence.
2521 void SwingSchedulerDAG::generatePhis(
2522     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2523     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2524     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2525     bool IsLast) {
2526   // Compute the stage number that contains the initial Phi value, and
2527   // the Phi from the previous stage.
2528   unsigned PrologStage = 0;
2529   unsigned PrevStage = 0;
2530   unsigned StageDiff = CurStageNum - LastStageNum;
2531   bool InKernel = (StageDiff == 0);
2532   if (InKernel) {
2533     PrologStage = LastStageNum - 1;
2534     PrevStage = CurStageNum;
2535   } else {
2536     PrologStage = LastStageNum - StageDiff;
2537     PrevStage = LastStageNum + StageDiff - 1;
2538   }
2539 
2540   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2541                                    BBE = BB->instr_end();
2542        BBI != BBE; ++BBI) {
2543     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2544       MachineOperand &MO = BBI->getOperand(i);
2545       if (!MO.isReg() || !MO.isDef() ||
2546           !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2547         continue;
2548 
2549       int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2550       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2551       unsigned Def = MO.getReg();
2552       unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2553       // An instruction scheduled in stage 0 and is used after the loop
2554       // requires a phi in the epilog for the last definition from either
2555       // the kernel or prolog.
2556       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2557           hasUseAfterLoop(Def, BB, MRI))
2558         NumPhis = 1;
2559       if (!InKernel && (unsigned)StageScheduled > PrologStage)
2560         continue;
2561 
2562       unsigned PhiOp2 = VRMap[PrevStage][Def];
2563       if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2564         if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2565           PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2566       // The number of Phis can't exceed the number of prolog stages. The
2567       // prolog stage number is zero based.
2568       if (NumPhis > PrologStage + 1 - StageScheduled)
2569         NumPhis = PrologStage + 1 - StageScheduled;
2570       for (unsigned np = 0; np < NumPhis; ++np) {
2571         unsigned PhiOp1 = VRMap[PrologStage][Def];
2572         if (np <= PrologStage)
2573           PhiOp1 = VRMap[PrologStage - np][Def];
2574         if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2575           if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2576             PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2577           if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2578             PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2579         }
2580         if (!InKernel)
2581           PhiOp2 = VRMap[PrevStage - np][Def];
2582 
2583         const TargetRegisterClass *RC = MRI.getRegClass(Def);
2584         unsigned NewReg = MRI.createVirtualRegister(RC);
2585 
2586         MachineInstrBuilder NewPhi =
2587             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2588                     TII->get(TargetOpcode::PHI), NewReg);
2589         NewPhi.addReg(PhiOp1).addMBB(BB1);
2590         NewPhi.addReg(PhiOp2).addMBB(BB2);
2591         if (np == 0)
2592           InstrMap[NewPhi] = &*BBI;
2593 
2594         // Rewrite uses and update the map. The actions depend upon whether
2595         // we generating code for the kernel or epilog blocks.
2596         if (InKernel) {
2597           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2598                                 &*BBI, PhiOp1, NewReg);
2599           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2600                                 &*BBI, PhiOp2, NewReg);
2601 
2602           PhiOp2 = NewReg;
2603           VRMap[PrevStage - np - 1][Def] = NewReg;
2604         } else {
2605           VRMap[CurStageNum - np][Def] = NewReg;
2606           if (np == NumPhis - 1)
2607             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2608                                   &*BBI, Def, NewReg);
2609         }
2610         if (IsLast && np == NumPhis - 1)
2611           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2612       }
2613     }
2614   }
2615 }
2616 
2617 /// Remove instructions that generate values with no uses.
2618 /// Typically, these are induction variable operations that generate values
2619 /// used in the loop itself.  A dead instruction has a definition with
2620 /// no uses, or uses that occur in the original loop only.
2621 void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2622                                                MBBVectorTy &EpilogBBs) {
2623   // For each epilog block, check that the value defined by each instruction
2624   // is used.  If not, delete it.
2625   for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2626                                      MBE = EpilogBBs.rend();
2627        MBB != MBE; ++MBB)
2628     for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2629                                                    ME = (*MBB)->instr_rend();
2630          MI != ME;) {
2631       // From DeadMachineInstructionElem. Don't delete inline assembly.
2632       if (MI->isInlineAsm()) {
2633         ++MI;
2634         continue;
2635       }
2636       bool SawStore = false;
2637       // Check if it's safe to remove the instruction due to side effects.
2638       // We can, and want to, remove Phis here.
2639       if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
2640         ++MI;
2641         continue;
2642       }
2643       bool used = true;
2644       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2645                                       MOE = MI->operands_end();
2646            MOI != MOE; ++MOI) {
2647         if (!MOI->isReg() || !MOI->isDef())
2648           continue;
2649         unsigned reg = MOI->getReg();
2650         // Assume physical registers are used, unless they are marked dead.
2651         if (TargetRegisterInfo::isPhysicalRegister(reg)) {
2652           used = !MOI->isDead();
2653           if (used)
2654             break;
2655           continue;
2656         }
2657         unsigned realUses = 0;
2658         for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2659                                                EI = MRI.use_end();
2660              UI != EI; ++UI) {
2661           // Check if there are any uses that occur only in the original
2662           // loop.  If so, that's not a real use.
2663           if (UI->getParent()->getParent() != BB) {
2664             realUses++;
2665             used = true;
2666             break;
2667           }
2668         }
2669         if (realUses > 0)
2670           break;
2671         used = false;
2672       }
2673       if (!used) {
2674         LIS.RemoveMachineInstrFromMaps(*MI);
2675         MI++->eraseFromParent();
2676         continue;
2677       }
2678       ++MI;
2679     }
2680   // In the kernel block, check if we can remove a Phi that generates a value
2681   // used in an instruction removed in the epilog block.
2682   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2683                                    BBE = KernelBB->getFirstNonPHI();
2684        BBI != BBE;) {
2685     MachineInstr *MI = &*BBI;
2686     ++BBI;
2687     unsigned reg = MI->getOperand(0).getReg();
2688     if (MRI.use_begin(reg) == MRI.use_end()) {
2689       LIS.RemoveMachineInstrFromMaps(*MI);
2690       MI->eraseFromParent();
2691     }
2692   }
2693 }
2694 
2695 /// For loop carried definitions, we split the lifetime of a virtual register
2696 /// that has uses past the definition in the next iteration. A copy with a new
2697 /// virtual register is inserted before the definition, which helps with
2698 /// generating a better register assignment.
2699 ///
2700 ///   v1 = phi(a, v2)     v1 = phi(a, v2)
2701 ///   v2 = phi(b, v3)     v2 = phi(b, v3)
2702 ///   v3 = ..             v4 = copy v1
2703 ///   .. = V1             v3 = ..
2704 ///                       .. = v4
2705 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
2706                                        MBBVectorTy &EpilogBBs,
2707                                        SMSchedule &Schedule) {
2708   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2709   for (auto &PHI : KernelBB->phis()) {
2710     unsigned Def = PHI.getOperand(0).getReg();
2711     // Check for any Phi definition that used as an operand of another Phi
2712     // in the same block.
2713     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
2714                                                  E = MRI.use_instr_end();
2715          I != E; ++I) {
2716       if (I->isPHI() && I->getParent() == KernelBB) {
2717         // Get the loop carried definition.
2718         unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
2719         if (!LCDef)
2720           continue;
2721         MachineInstr *MI = MRI.getVRegDef(LCDef);
2722         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
2723           continue;
2724         // Search through the rest of the block looking for uses of the Phi
2725         // definition. If one occurs, then split the lifetime.
2726         unsigned SplitReg = 0;
2727         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
2728                                     KernelBB->instr_end()))
2729           if (BBJ.readsRegister(Def)) {
2730             // We split the lifetime when we find the first use.
2731             if (SplitReg == 0) {
2732               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
2733               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
2734                       TII->get(TargetOpcode::COPY), SplitReg)
2735                   .addReg(Def);
2736             }
2737             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
2738           }
2739         if (!SplitReg)
2740           continue;
2741         // Search through each of the epilog blocks for any uses to be renamed.
2742         for (auto &Epilog : EpilogBBs)
2743           for (auto &I : *Epilog)
2744             if (I.readsRegister(Def))
2745               I.substituteRegister(Def, SplitReg, 0, *TRI);
2746         break;
2747       }
2748     }
2749   }
2750 }
2751 
2752 /// Remove the incoming block from the Phis in a basic block.
2753 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
2754   for (MachineInstr &MI : *BB) {
2755     if (!MI.isPHI())
2756       break;
2757     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
2758       if (MI.getOperand(i + 1).getMBB() == Incoming) {
2759         MI.RemoveOperand(i + 1);
2760         MI.RemoveOperand(i);
2761         break;
2762       }
2763   }
2764 }
2765 
2766 /// Create branches from each prolog basic block to the appropriate epilog
2767 /// block.  These edges are needed if the loop ends before reaching the
2768 /// kernel.
2769 void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2770                                     MachineBasicBlock *KernelBB,
2771                                     MBBVectorTy &EpilogBBs,
2772                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
2773   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
2774   MachineInstr *IndVar = Pass.LI.LoopInductionVar;
2775   MachineInstr *Cmp = Pass.LI.LoopCompare;
2776   MachineBasicBlock *LastPro = KernelBB;
2777   MachineBasicBlock *LastEpi = KernelBB;
2778 
2779   // Start from the blocks connected to the kernel and work "out"
2780   // to the first prolog and the last epilog blocks.
2781   SmallVector<MachineInstr *, 4> PrevInsts;
2782   unsigned MaxIter = PrologBBs.size() - 1;
2783   unsigned LC = UINT_MAX;
2784   unsigned LCMin = UINT_MAX;
2785   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
2786     // Add branches to the prolog that go to the corresponding
2787     // epilog, and the fall-thru prolog/kernel block.
2788     MachineBasicBlock *Prolog = PrologBBs[j];
2789     MachineBasicBlock *Epilog = EpilogBBs[i];
2790     // We've executed one iteration, so decrement the loop count and check for
2791     // the loop end.
2792     SmallVector<MachineOperand, 4> Cond;
2793     // Check if the LOOP0 has already been removed. If so, then there is no need
2794     // to reduce the trip count.
2795     if (LC != 0)
2796       LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
2797                                 MaxIter);
2798 
2799     // Record the value of the first trip count, which is used to determine if
2800     // branches and blocks can be removed for constant trip counts.
2801     if (LCMin == UINT_MAX)
2802       LCMin = LC;
2803 
2804     unsigned numAdded = 0;
2805     if (TargetRegisterInfo::isVirtualRegister(LC)) {
2806       Prolog->addSuccessor(Epilog);
2807       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
2808     } else if (j >= LCMin) {
2809       Prolog->addSuccessor(Epilog);
2810       Prolog->removeSuccessor(LastPro);
2811       LastEpi->removeSuccessor(Epilog);
2812       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
2813       removePhis(Epilog, LastEpi);
2814       // Remove the blocks that are no longer referenced.
2815       if (LastPro != LastEpi) {
2816         LastEpi->clear();
2817         LastEpi->eraseFromParent();
2818       }
2819       LastPro->clear();
2820       LastPro->eraseFromParent();
2821     } else {
2822       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
2823       removePhis(Epilog, Prolog);
2824     }
2825     LastPro = Prolog;
2826     LastEpi = Epilog;
2827     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
2828                                                    E = Prolog->instr_rend();
2829          I != E && numAdded > 0; ++I, --numAdded)
2830       updateInstruction(&*I, false, j, 0, Schedule, VRMap);
2831   }
2832 }
2833 
2834 /// Return true if we can compute the amount the instruction changes
2835 /// during each iteration. Set Delta to the amount of the change.
2836 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2837   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2838   const MachineOperand *BaseOp;
2839   int64_t Offset;
2840   if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
2841     return false;
2842 
2843   if (!BaseOp->isReg())
2844     return false;
2845 
2846   unsigned BaseReg = BaseOp->getReg();
2847 
2848   MachineRegisterInfo &MRI = MF.getRegInfo();
2849   // Check if there is a Phi. If so, get the definition in the loop.
2850   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2851   if (BaseDef && BaseDef->isPHI()) {
2852     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2853     BaseDef = MRI.getVRegDef(BaseReg);
2854   }
2855   if (!BaseDef)
2856     return false;
2857 
2858   int D = 0;
2859   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2860     return false;
2861 
2862   Delta = D;
2863   return true;
2864 }
2865 
2866 /// Update the memory operand with a new offset when the pipeliner
2867 /// generates a new copy of the instruction that refers to a
2868 /// different memory location.
2869 void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
2870                                           MachineInstr &OldMI, unsigned Num) {
2871   if (Num == 0)
2872     return;
2873   // If the instruction has memory operands, then adjust the offset
2874   // when the instruction appears in different stages.
2875   if (NewMI.memoperands_empty())
2876     return;
2877   SmallVector<MachineMemOperand *, 2> NewMMOs;
2878   for (MachineMemOperand *MMO : NewMI.memoperands()) {
2879     // TODO: Figure out whether isAtomic is really necessary (see D57601).
2880     if (MMO->isVolatile() || MMO->isAtomic() ||
2881         (MMO->isInvariant() && MMO->isDereferenceable()) ||
2882         (!MMO->getValue())) {
2883       NewMMOs.push_back(MMO);
2884       continue;
2885     }
2886     unsigned Delta;
2887     if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
2888       int64_t AdjOffset = Delta * Num;
2889       NewMMOs.push_back(
2890           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
2891     } else {
2892       NewMMOs.push_back(
2893           MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
2894     }
2895   }
2896   NewMI.setMemRefs(MF, NewMMOs);
2897 }
2898 
2899 /// Clone the instruction for the new pipelined loop and update the
2900 /// memory operands, if needed.
2901 MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
2902                                             unsigned CurStageNum,
2903                                             unsigned InstStageNum) {
2904   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2905   // Check for tied operands in inline asm instructions. This should be handled
2906   // elsewhere, but I'm not sure of the best solution.
2907   if (OldMI->isInlineAsm())
2908     for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
2909       const auto &MO = OldMI->getOperand(i);
2910       if (MO.isReg() && MO.isUse())
2911         break;
2912       unsigned UseIdx;
2913       if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
2914         NewMI->tieOperands(i, UseIdx);
2915     }
2916   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
2917   return NewMI;
2918 }
2919 
2920 /// Clone the instruction for the new pipelined loop. If needed, this
2921 /// function updates the instruction using the values saved in the
2922 /// InstrChanges structure.
2923 MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
2924                                                      unsigned CurStageNum,
2925                                                      unsigned InstStageNum,
2926                                                      SMSchedule &Schedule) {
2927   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2928   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2929       InstrChanges.find(getSUnit(OldMI));
2930   if (It != InstrChanges.end()) {
2931     std::pair<unsigned, int64_t> RegAndOffset = It->second;
2932     unsigned BasePos, OffsetPos;
2933     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
2934       return nullptr;
2935     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
2936     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
2937     if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
2938       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
2939     NewMI->getOperand(OffsetPos).setImm(NewOffset);
2940   }
2941   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
2942   return NewMI;
2943 }
2944 
2945 /// Update the machine instruction with new virtual registers.  This
2946 /// function may change the defintions and/or uses.
2947 void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
2948                                           unsigned CurStageNum,
2949                                           unsigned InstrStageNum,
2950                                           SMSchedule &Schedule,
2951                                           ValueMapTy *VRMap) {
2952   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
2953     MachineOperand &MO = NewMI->getOperand(i);
2954     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2955       continue;
2956     unsigned reg = MO.getReg();
2957     if (MO.isDef()) {
2958       // Create a new virtual register for the definition.
2959       const TargetRegisterClass *RC = MRI.getRegClass(reg);
2960       unsigned NewReg = MRI.createVirtualRegister(RC);
2961       MO.setReg(NewReg);
2962       VRMap[CurStageNum][reg] = NewReg;
2963       if (LastDef)
2964         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
2965     } else if (MO.isUse()) {
2966       MachineInstr *Def = MRI.getVRegDef(reg);
2967       // Compute the stage that contains the last definition for instruction.
2968       int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
2969       unsigned StageNum = CurStageNum;
2970       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
2971         // Compute the difference in stages between the defintion and the use.
2972         unsigned StageDiff = (InstrStageNum - DefStageNum);
2973         // Make an adjustment to get the last definition.
2974         StageNum -= StageDiff;
2975       }
2976       if (VRMap[StageNum].count(reg))
2977         MO.setReg(VRMap[StageNum][reg]);
2978     }
2979   }
2980 }
2981 
2982 /// Return the instruction in the loop that defines the register.
2983 /// If the definition is a Phi, then follow the Phi operand to
2984 /// the instruction in the loop.
2985 MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
2986   SmallPtrSet<MachineInstr *, 8> Visited;
2987   MachineInstr *Def = MRI.getVRegDef(Reg);
2988   while (Def->isPHI()) {
2989     if (!Visited.insert(Def).second)
2990       break;
2991     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2992       if (Def->getOperand(i + 1).getMBB() == BB) {
2993         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2994         break;
2995       }
2996   }
2997   return Def;
2998 }
2999 
3000 /// Return the new name for the value from the previous stage.
3001 unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3002                                           unsigned LoopVal, unsigned LoopStage,
3003                                           ValueMapTy *VRMap,
3004                                           MachineBasicBlock *BB) {
3005   unsigned PrevVal = 0;
3006   if (StageNum > PhiStage) {
3007     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3008     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
3009       // The name is defined in the previous stage.
3010       PrevVal = VRMap[StageNum - 1][LoopVal];
3011     else if (VRMap[StageNum].count(LoopVal))
3012       // The previous name is defined in the current stage when the instruction
3013       // order is swapped.
3014       PrevVal = VRMap[StageNum][LoopVal];
3015     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
3016       // The loop value hasn't yet been scheduled.
3017       PrevVal = LoopVal;
3018     else if (StageNum == PhiStage + 1)
3019       // The loop value is another phi, which has not been scheduled.
3020       PrevVal = getInitPhiReg(*LoopInst, BB);
3021     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
3022       // The loop value is another phi, which has been scheduled.
3023       PrevVal =
3024           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3025                         LoopStage, VRMap, BB);
3026   }
3027   return PrevVal;
3028 }
3029 
3030 /// Rewrite the Phi values in the specified block to use the mappings
3031 /// from the initial operand. Once the Phi is scheduled, we switch
3032 /// to using the loop value instead of the Phi value, so those names
3033 /// do not need to be rewritten.
3034 void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3035                                          unsigned StageNum,
3036                                          SMSchedule &Schedule,
3037                                          ValueMapTy *VRMap,
3038                                          InstrMapTy &InstrMap) {
3039   for (auto &PHI : BB->phis()) {
3040     unsigned InitVal = 0;
3041     unsigned LoopVal = 0;
3042     getPhiRegs(PHI, BB, InitVal, LoopVal);
3043     unsigned PhiDef = PHI.getOperand(0).getReg();
3044 
3045     unsigned PhiStage =
3046         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3047     unsigned LoopStage =
3048         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3049     unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3050     if (NumPhis > StageNum)
3051       NumPhis = StageNum;
3052     for (unsigned np = 0; np <= NumPhis; ++np) {
3053       unsigned NewVal =
3054           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3055       if (!NewVal)
3056         NewVal = InitVal;
3057       rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
3058                             PhiDef, NewVal);
3059     }
3060   }
3061 }
3062 
3063 /// Rewrite a previously scheduled instruction to use the register value
3064 /// from the new instruction. Make sure the instruction occurs in the
3065 /// basic block, and we don't change the uses in the new instruction.
3066 void SwingSchedulerDAG::rewriteScheduledInstr(
3067     MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3068     unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3069     unsigned NewReg, unsigned PrevReg) {
3070   bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3071   int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3072   // Rewrite uses that have been scheduled already to use the new
3073   // Phi register.
3074   for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3075                                          EI = MRI.use_end();
3076        UI != EI;) {
3077     MachineOperand &UseOp = *UI;
3078     MachineInstr *UseMI = UseOp.getParent();
3079     ++UI;
3080     if (UseMI->getParent() != BB)
3081       continue;
3082     if (UseMI->isPHI()) {
3083       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3084         continue;
3085       if (getLoopPhiReg(*UseMI, BB) != OldReg)
3086         continue;
3087     }
3088     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3089     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3090     SUnit *OrigMISU = getSUnit(OrigInstr->second);
3091     int StageSched = Schedule.stageScheduled(OrigMISU);
3092     int CycleSched = Schedule.cycleScheduled(OrigMISU);
3093     unsigned ReplaceReg = 0;
3094     // This is the stage for the scheduled instruction.
3095     if (StagePhi == StageSched && Phi->isPHI()) {
3096       int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3097       if (PrevReg && InProlog)
3098         ReplaceReg = PrevReg;
3099       else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3100                (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3101         ReplaceReg = PrevReg;
3102       else
3103         ReplaceReg = NewReg;
3104     }
3105     // The scheduled instruction occurs before the scheduled Phi, and the
3106     // Phi is not loop carried.
3107     if (!InProlog && StagePhi + 1 == StageSched &&
3108         !Schedule.isLoopCarried(this, *Phi))
3109       ReplaceReg = NewReg;
3110     if (StagePhi > StageSched && Phi->isPHI())
3111       ReplaceReg = NewReg;
3112     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3113       ReplaceReg = NewReg;
3114     if (ReplaceReg) {
3115       MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3116       UseOp.setReg(ReplaceReg);
3117     }
3118   }
3119 }
3120 
3121 /// Check if we can change the instruction to use an offset value from the
3122 /// previous iteration. If so, return true and set the base and offset values
3123 /// so that we can rewrite the load, if necessary.
3124 ///   v1 = Phi(v0, v3)
3125 ///   v2 = load v1, 0
3126 ///   v3 = post_store v1, 4, x
3127 /// This function enables the load to be rewritten as v2 = load v3, 4.
3128 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3129                                               unsigned &BasePos,
3130                                               unsigned &OffsetPos,
3131                                               unsigned &NewBase,
3132                                               int64_t &Offset) {
3133   // Get the load instruction.
3134   if (TII->isPostIncrement(*MI))
3135     return false;
3136   unsigned BasePosLd, OffsetPosLd;
3137   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3138     return false;
3139   unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3140 
3141   // Look for the Phi instruction.
3142   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3143   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3144   if (!Phi || !Phi->isPHI())
3145     return false;
3146   // Get the register defined in the loop block.
3147   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3148   if (!PrevReg)
3149     return false;
3150 
3151   // Check for the post-increment load/store instruction.
3152   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3153   if (!PrevDef || PrevDef == MI)
3154     return false;
3155 
3156   if (!TII->isPostIncrement(*PrevDef))
3157     return false;
3158 
3159   unsigned BasePos1 = 0, OffsetPos1 = 0;
3160   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3161     return false;
3162 
3163   // Make sure that the instructions do not access the same memory location in
3164   // the next iteration.
3165   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3166   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3167   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3168   NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3169   bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3170   MF.DeleteMachineInstr(NewMI);
3171   if (!Disjoint)
3172     return false;
3173 
3174   // Set the return value once we determine that we return true.
3175   BasePos = BasePosLd;
3176   OffsetPos = OffsetPosLd;
3177   NewBase = PrevReg;
3178   Offset = StoreOffset;
3179   return true;
3180 }
3181 
3182 /// Apply changes to the instruction if needed. The changes are need
3183 /// to improve the scheduling and depend up on the final schedule.
3184 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3185                                          SMSchedule &Schedule) {
3186   SUnit *SU = getSUnit(MI);
3187   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3188       InstrChanges.find(SU);
3189   if (It != InstrChanges.end()) {
3190     std::pair<unsigned, int64_t> RegAndOffset = It->second;
3191     unsigned BasePos, OffsetPos;
3192     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3193       return;
3194     unsigned BaseReg = MI->getOperand(BasePos).getReg();
3195     MachineInstr *LoopDef = findDefInLoop(BaseReg);
3196     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3197     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3198     int BaseStageNum = Schedule.stageScheduled(SU);
3199     int BaseCycleNum = Schedule.cycleScheduled(SU);
3200     if (BaseStageNum < DefStageNum) {
3201       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3202       int OffsetDiff = DefStageNum - BaseStageNum;
3203       if (DefCycleNum < BaseCycleNum) {
3204         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3205         if (OffsetDiff > 0)
3206           --OffsetDiff;
3207       }
3208       int64_t NewOffset =
3209           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3210       NewMI->getOperand(OffsetPos).setImm(NewOffset);
3211       SU->setInstr(NewMI);
3212       MISUnitMap[NewMI] = SU;
3213       NewMIs.insert(NewMI);
3214     }
3215   }
3216 }
3217 
3218 /// Return true for an order or output dependence that is loop carried
3219 /// potentially. A dependence is loop carried if the destination defines a valu
3220 /// that may be used or defined by the source in a subsequent iteration.
3221 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
3222                                          bool isSucc) {
3223   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
3224       Dep.isArtificial())
3225     return false;
3226 
3227   if (!SwpPruneLoopCarried)
3228     return true;
3229 
3230   if (Dep.getKind() == SDep::Output)
3231     return true;
3232 
3233   MachineInstr *SI = Source->getInstr();
3234   MachineInstr *DI = Dep.getSUnit()->getInstr();
3235   if (!isSucc)
3236     std::swap(SI, DI);
3237   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3238 
3239   // Assume ordered loads and stores may have a loop carried dependence.
3240   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3241       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3242     return true;
3243 
3244   // Only chain dependences between a load and store can be loop carried.
3245   if (!DI->mayStore() || !SI->mayLoad())
3246     return false;
3247 
3248   unsigned DeltaS, DeltaD;
3249   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
3250     return true;
3251 
3252   const MachineOperand *BaseOpS, *BaseOpD;
3253   int64_t OffsetS, OffsetD;
3254   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3255   if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||
3256       !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI))
3257     return true;
3258 
3259   if (!BaseOpS->isIdenticalTo(*BaseOpD))
3260     return true;
3261 
3262   // Check that the base register is incremented by a constant value for each
3263   // iteration.
3264   MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
3265   if (!Def || !Def->isPHI())
3266     return true;
3267   unsigned InitVal = 0;
3268   unsigned LoopVal = 0;
3269   getPhiRegs(*Def, BB, InitVal, LoopVal);
3270   MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
3271   int D = 0;
3272   if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
3273     return true;
3274 
3275   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3276   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3277 
3278   // This is the main test, which checks the offset values and the loop
3279   // increment value to determine if the accesses may be loop carried.
3280   if (AccessSizeS == MemoryLocation::UnknownSize ||
3281       AccessSizeD == MemoryLocation::UnknownSize)
3282     return true;
3283 
3284   if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
3285     return true;
3286 
3287   return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
3288 }
3289 
3290 void SwingSchedulerDAG::postprocessDAG() {
3291   for (auto &M : Mutations)
3292     M->apply(this);
3293 }
3294 
3295 /// Try to schedule the node at the specified StartCycle and continue
3296 /// until the node is schedule or the EndCycle is reached.  This function
3297 /// returns true if the node is scheduled.  This routine may search either
3298 /// forward or backward for a place to insert the instruction based upon
3299 /// the relative values of StartCycle and EndCycle.
3300 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3301   bool forward = true;
3302   LLVM_DEBUG({
3303     dbgs() << "Trying to insert node between " << StartCycle << " and "
3304            << EndCycle << " II: " << II << "\n";
3305   });
3306   if (StartCycle > EndCycle)
3307     forward = false;
3308 
3309   // The terminating condition depends on the direction.
3310   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3311   for (int curCycle = StartCycle; curCycle != termCycle;
3312        forward ? ++curCycle : --curCycle) {
3313 
3314     // Add the already scheduled instructions at the specified cycle to the
3315     // DFA.
3316     ProcItinResources.clearResources();
3317     for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3318          checkCycle <= LastCycle; checkCycle += II) {
3319       std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3320 
3321       for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3322                                          E = cycleInstrs.end();
3323            I != E; ++I) {
3324         if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3325           continue;
3326         assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) &&
3327                "These instructions have already been scheduled.");
3328         ProcItinResources.reserveResources(*(*I)->getInstr());
3329       }
3330     }
3331     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3332         ProcItinResources.canReserveResources(*SU->getInstr())) {
3333       LLVM_DEBUG({
3334         dbgs() << "\tinsert at cycle " << curCycle << " ";
3335         SU->getInstr()->dump();
3336       });
3337 
3338       ScheduledInstrs[curCycle].push_back(SU);
3339       InstrToCycle.insert(std::make_pair(SU, curCycle));
3340       if (curCycle > LastCycle)
3341         LastCycle = curCycle;
3342       if (curCycle < FirstCycle)
3343         FirstCycle = curCycle;
3344       return true;
3345     }
3346     LLVM_DEBUG({
3347       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3348       SU->getInstr()->dump();
3349     });
3350   }
3351   return false;
3352 }
3353 
3354 // Return the cycle of the earliest scheduled instruction in the chain.
3355 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3356   SmallPtrSet<SUnit *, 8> Visited;
3357   SmallVector<SDep, 8> Worklist;
3358   Worklist.push_back(Dep);
3359   int EarlyCycle = INT_MAX;
3360   while (!Worklist.empty()) {
3361     const SDep &Cur = Worklist.pop_back_val();
3362     SUnit *PrevSU = Cur.getSUnit();
3363     if (Visited.count(PrevSU))
3364       continue;
3365     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3366     if (it == InstrToCycle.end())
3367       continue;
3368     EarlyCycle = std::min(EarlyCycle, it->second);
3369     for (const auto &PI : PrevSU->Preds)
3370       if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3371         Worklist.push_back(PI);
3372     Visited.insert(PrevSU);
3373   }
3374   return EarlyCycle;
3375 }
3376 
3377 // Return the cycle of the latest scheduled instruction in the chain.
3378 int SMSchedule::latestCycleInChain(const SDep &Dep) {
3379   SmallPtrSet<SUnit *, 8> Visited;
3380   SmallVector<SDep, 8> Worklist;
3381   Worklist.push_back(Dep);
3382   int LateCycle = INT_MIN;
3383   while (!Worklist.empty()) {
3384     const SDep &Cur = Worklist.pop_back_val();
3385     SUnit *SuccSU = Cur.getSUnit();
3386     if (Visited.count(SuccSU))
3387       continue;
3388     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3389     if (it == InstrToCycle.end())
3390       continue;
3391     LateCycle = std::max(LateCycle, it->second);
3392     for (const auto &SI : SuccSU->Succs)
3393       if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3394         Worklist.push_back(SI);
3395     Visited.insert(SuccSU);
3396   }
3397   return LateCycle;
3398 }
3399 
3400 /// If an instruction has a use that spans multiple iterations, then
3401 /// return true. These instructions are characterized by having a back-ege
3402 /// to a Phi, which contains a reference to another Phi.
3403 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3404   for (auto &P : SU->Preds)
3405     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
3406       for (auto &S : P.getSUnit()->Succs)
3407         if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3408           return P.getSUnit();
3409   return nullptr;
3410 }
3411 
3412 /// Compute the scheduling start slot for the instruction.  The start slot
3413 /// depends on any predecessor or successor nodes scheduled already.
3414 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3415                               int *MinEnd, int *MaxStart, int II,
3416                               SwingSchedulerDAG *DAG) {
3417   // Iterate over each instruction that has been scheduled already.  The start
3418   // slot computation depends on whether the previously scheduled instruction
3419   // is a predecessor or successor of the specified instruction.
3420   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3421 
3422     // Iterate over each instruction in the current cycle.
3423     for (SUnit *I : getInstructions(cycle)) {
3424       // Because we're processing a DAG for the dependences, we recognize
3425       // the back-edge in recurrences by anti dependences.
3426       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
3427         const SDep &Dep = SU->Preds[i];
3428         if (Dep.getSUnit() == I) {
3429           if (!DAG->isBackedge(SU, Dep)) {
3430             int EarlyStart = cycle + Dep.getLatency() -
3431                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3432             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3433             if (DAG->isLoopCarriedDep(SU, Dep, false)) {
3434               int End = earliestCycleInChain(Dep) + (II - 1);
3435               *MinEnd = std::min(*MinEnd, End);
3436             }
3437           } else {
3438             int LateStart = cycle - Dep.getLatency() +
3439                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3440             *MinLateStart = std::min(*MinLateStart, LateStart);
3441           }
3442         }
3443         // For instruction that requires multiple iterations, make sure that
3444         // the dependent instruction is not scheduled past the definition.
3445         SUnit *BE = multipleIterations(I, DAG);
3446         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3447             !SU->isPred(I))
3448           *MinLateStart = std::min(*MinLateStart, cycle);
3449       }
3450       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
3451         if (SU->Succs[i].getSUnit() == I) {
3452           const SDep &Dep = SU->Succs[i];
3453           if (!DAG->isBackedge(SU, Dep)) {
3454             int LateStart = cycle - Dep.getLatency() +
3455                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3456             *MinLateStart = std::min(*MinLateStart, LateStart);
3457             if (DAG->isLoopCarriedDep(SU, Dep)) {
3458               int Start = latestCycleInChain(Dep) + 1 - II;
3459               *MaxStart = std::max(*MaxStart, Start);
3460             }
3461           } else {
3462             int EarlyStart = cycle + Dep.getLatency() -
3463                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3464             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3465           }
3466         }
3467       }
3468     }
3469   }
3470 }
3471 
3472 /// Order the instructions within a cycle so that the definitions occur
3473 /// before the uses. Returns true if the instruction is added to the start
3474 /// of the list, or false if added to the end.
3475 void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3476                                  std::deque<SUnit *> &Insts) {
3477   MachineInstr *MI = SU->getInstr();
3478   bool OrderBeforeUse = false;
3479   bool OrderAfterDef = false;
3480   bool OrderBeforeDef = false;
3481   unsigned MoveDef = 0;
3482   unsigned MoveUse = 0;
3483   int StageInst1 = stageScheduled(SU);
3484 
3485   unsigned Pos = 0;
3486   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3487        ++I, ++Pos) {
3488     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3489       MachineOperand &MO = MI->getOperand(i);
3490       if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3491         continue;
3492 
3493       unsigned Reg = MO.getReg();
3494       unsigned BasePos, OffsetPos;
3495       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3496         if (MI->getOperand(BasePos).getReg() == Reg)
3497           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3498             Reg = NewReg;
3499       bool Reads, Writes;
3500       std::tie(Reads, Writes) =
3501           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3502       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3503         OrderBeforeUse = true;
3504         if (MoveUse == 0)
3505           MoveUse = Pos;
3506       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3507         // Add the instruction after the scheduled instruction.
3508         OrderAfterDef = true;
3509         MoveDef = Pos;
3510       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3511         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3512           OrderBeforeUse = true;
3513           if (MoveUse == 0)
3514             MoveUse = Pos;
3515         } else {
3516           OrderAfterDef = true;
3517           MoveDef = Pos;
3518         }
3519       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3520         OrderBeforeUse = true;
3521         if (MoveUse == 0)
3522           MoveUse = Pos;
3523         if (MoveUse != 0) {
3524           OrderAfterDef = true;
3525           MoveDef = Pos - 1;
3526         }
3527       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3528         // Add the instruction before the scheduled instruction.
3529         OrderBeforeUse = true;
3530         if (MoveUse == 0)
3531           MoveUse = Pos;
3532       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3533                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3534         if (MoveUse == 0) {
3535           OrderBeforeDef = true;
3536           MoveUse = Pos;
3537         }
3538       }
3539     }
3540     // Check for order dependences between instructions. Make sure the source
3541     // is ordered before the destination.
3542     for (auto &S : SU->Succs) {
3543       if (S.getSUnit() != *I)
3544         continue;
3545       if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3546         OrderBeforeUse = true;
3547         if (Pos < MoveUse)
3548           MoveUse = Pos;
3549       }
3550     }
3551     for (auto &P : SU->Preds) {
3552       if (P.getSUnit() != *I)
3553         continue;
3554       if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3555         OrderAfterDef = true;
3556         MoveDef = Pos;
3557       }
3558     }
3559   }
3560 
3561   // A circular dependence.
3562   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3563     OrderBeforeUse = false;
3564 
3565   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3566   // to a loop-carried dependence.
3567   if (OrderBeforeDef)
3568     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3569 
3570   // The uncommon case when the instruction order needs to be updated because
3571   // there is both a use and def.
3572   if (OrderBeforeUse && OrderAfterDef) {
3573     SUnit *UseSU = Insts.at(MoveUse);
3574     SUnit *DefSU = Insts.at(MoveDef);
3575     if (MoveUse > MoveDef) {
3576       Insts.erase(Insts.begin() + MoveUse);
3577       Insts.erase(Insts.begin() + MoveDef);
3578     } else {
3579       Insts.erase(Insts.begin() + MoveDef);
3580       Insts.erase(Insts.begin() + MoveUse);
3581     }
3582     orderDependence(SSD, UseSU, Insts);
3583     orderDependence(SSD, SU, Insts);
3584     orderDependence(SSD, DefSU, Insts);
3585     return;
3586   }
3587   // Put the new instruction first if there is a use in the list. Otherwise,
3588   // put it at the end of the list.
3589   if (OrderBeforeUse)
3590     Insts.push_front(SU);
3591   else
3592     Insts.push_back(SU);
3593 }
3594 
3595 /// Return true if the scheduled Phi has a loop carried operand.
3596 bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3597   if (!Phi.isPHI())
3598     return false;
3599   assert(Phi.isPHI() && "Expecting a Phi.");
3600   SUnit *DefSU = SSD->getSUnit(&Phi);
3601   unsigned DefCycle = cycleScheduled(DefSU);
3602   int DefStage = stageScheduled(DefSU);
3603 
3604   unsigned InitVal = 0;
3605   unsigned LoopVal = 0;
3606   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3607   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3608   if (!UseSU)
3609     return true;
3610   if (UseSU->getInstr()->isPHI())
3611     return true;
3612   unsigned LoopCycle = cycleScheduled(UseSU);
3613   int LoopStage = stageScheduled(UseSU);
3614   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3615 }
3616 
3617 /// Return true if the instruction is a definition that is loop carried
3618 /// and defines the use on the next iteration.
3619 ///        v1 = phi(v2, v3)
3620 ///  (Def) v3 = op v1
3621 ///  (MO)   = v1
3622 /// If MO appears before Def, then then v1 and v3 may get assigned to the same
3623 /// register.
3624 bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
3625                                        MachineInstr *Def, MachineOperand &MO) {
3626   if (!MO.isReg())
3627     return false;
3628   if (Def->isPHI())
3629     return false;
3630   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3631   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3632     return false;
3633   if (!isLoopCarried(SSD, *Phi))
3634     return false;
3635   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3636   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
3637     MachineOperand &DMO = Def->getOperand(i);
3638     if (!DMO.isReg() || !DMO.isDef())
3639       continue;
3640     if (DMO.getReg() == LoopReg)
3641       return true;
3642   }
3643   return false;
3644 }
3645 
3646 // Check if the generated schedule is valid. This function checks if
3647 // an instruction that uses a physical register is scheduled in a
3648 // different stage than the definition. The pipeliner does not handle
3649 // physical register values that may cross a basic block boundary.
3650 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3651   for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
3652     SUnit &SU = SSD->SUnits[i];
3653     if (!SU.hasPhysRegDefs)
3654       continue;
3655     int StageDef = stageScheduled(&SU);
3656     assert(StageDef != -1 && "Instruction should have been scheduled.");
3657     for (auto &SI : SU.Succs)
3658       if (SI.isAssignedRegDep())
3659         if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
3660           if (stageScheduled(SI.getSUnit()) != StageDef)
3661             return false;
3662   }
3663   return true;
3664 }
3665 
3666 /// A property of the node order in swing-modulo-scheduling is
3667 /// that for nodes outside circuits the following holds:
3668 /// none of them is scheduled after both a successor and a
3669 /// predecessor.
3670 /// The method below checks whether the property is met.
3671 /// If not, debug information is printed and statistics information updated.
3672 /// Note that we do not use an assert statement.
3673 /// The reason is that although an invalid node oder may prevent
3674 /// the pipeliner from finding a pipelined schedule for arbitrary II,
3675 /// it does not lead to the generation of incorrect code.
3676 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3677 
3678   // a sorted vector that maps each SUnit to its index in the NodeOrder
3679   typedef std::pair<SUnit *, unsigned> UnitIndex;
3680   std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3681 
3682   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3683     Indices.push_back(std::make_pair(NodeOrder[i], i));
3684 
3685   auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3686     return std::get<0>(i1) < std::get<0>(i2);
3687   };
3688 
3689   // sort, so that we can perform a binary search
3690   llvm::sort(Indices, CompareKey);
3691 
3692   bool Valid = true;
3693   (void)Valid;
3694   // for each SUnit in the NodeOrder, check whether
3695   // it appears after both a successor and a predecessor
3696   // of the SUnit. If this is the case, and the SUnit
3697   // is not part of circuit, then the NodeOrder is not
3698   // valid.
3699   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3700     SUnit *SU = NodeOrder[i];
3701     unsigned Index = i;
3702 
3703     bool PredBefore = false;
3704     bool SuccBefore = false;
3705 
3706     SUnit *Succ;
3707     SUnit *Pred;
3708     (void)Succ;
3709     (void)Pred;
3710 
3711     for (SDep &PredEdge : SU->Preds) {
3712       SUnit *PredSU = PredEdge.getSUnit();
3713       unsigned PredIndex =
3714           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
3715                                         std::make_pair(PredSU, 0), CompareKey));
3716       if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3717         PredBefore = true;
3718         Pred = PredSU;
3719         break;
3720       }
3721     }
3722 
3723     for (SDep &SuccEdge : SU->Succs) {
3724       SUnit *SuccSU = SuccEdge.getSUnit();
3725       unsigned SuccIndex =
3726           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
3727                                         std::make_pair(SuccSU, 0), CompareKey));
3728       if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3729         SuccBefore = true;
3730         Succ = SuccSU;
3731         break;
3732       }
3733     }
3734 
3735     if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3736       // instructions in circuits are allowed to be scheduled
3737       // after both a successor and predecessor.
3738       bool InCircuit = std::any_of(
3739           Circuits.begin(), Circuits.end(),
3740           [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3741       if (InCircuit)
3742         LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3743       else {
3744         Valid = false;
3745         NumNodeOrderIssues++;
3746         LLVM_DEBUG(dbgs() << "Predecessor ";);
3747       }
3748       LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3749                         << " are scheduled before node " << SU->NodeNum
3750                         << "\n";);
3751     }
3752   }
3753 
3754   LLVM_DEBUG({
3755     if (!Valid)
3756       dbgs() << "Invalid node order found!\n";
3757   });
3758 }
3759 
3760 /// Attempt to fix the degenerate cases when the instruction serialization
3761 /// causes the register lifetimes to overlap. For example,
3762 ///   p' = store_pi(p, b)
3763 ///      = load p, offset
3764 /// In this case p and p' overlap, which means that two registers are needed.
3765 /// Instead, this function changes the load to use p' and updates the offset.
3766 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3767   unsigned OverlapReg = 0;
3768   unsigned NewBaseReg = 0;
3769   for (SUnit *SU : Instrs) {
3770     MachineInstr *MI = SU->getInstr();
3771     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3772       const MachineOperand &MO = MI->getOperand(i);
3773       // Look for an instruction that uses p. The instruction occurs in the
3774       // same cycle but occurs later in the serialized order.
3775       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3776         // Check that the instruction appears in the InstrChanges structure,
3777         // which contains instructions that can have the offset updated.
3778         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3779           InstrChanges.find(SU);
3780         if (It != InstrChanges.end()) {
3781           unsigned BasePos, OffsetPos;
3782           // Update the base register and adjust the offset.
3783           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3784             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3785             NewMI->getOperand(BasePos).setReg(NewBaseReg);
3786             int64_t NewOffset =
3787                 MI->getOperand(OffsetPos).getImm() - It->second.second;
3788             NewMI->getOperand(OffsetPos).setImm(NewOffset);
3789             SU->setInstr(NewMI);
3790             MISUnitMap[NewMI] = SU;
3791             NewMIs.insert(NewMI);
3792           }
3793         }
3794         OverlapReg = 0;
3795         NewBaseReg = 0;
3796         break;
3797       }
3798       // Look for an instruction of the form p' = op(p), which uses and defines
3799       // two virtual registers that get allocated to the same physical register.
3800       unsigned TiedUseIdx = 0;
3801       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3802         // OverlapReg is p in the example above.
3803         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3804         // NewBaseReg is p' in the example above.
3805         NewBaseReg = MI->getOperand(i).getReg();
3806         break;
3807       }
3808     }
3809   }
3810 }
3811 
3812 /// After the schedule has been formed, call this function to combine
3813 /// the instructions from the different stages/cycles.  That is, this
3814 /// function creates a schedule that represents a single iteration.
3815 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3816   // Move all instructions to the first stage from later stages.
3817   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3818     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3819          ++stage) {
3820       std::deque<SUnit *> &cycleInstrs =
3821           ScheduledInstrs[cycle + (stage * InitiationInterval)];
3822       for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
3823                                                  E = cycleInstrs.rend();
3824            I != E; ++I)
3825         ScheduledInstrs[cycle].push_front(*I);
3826     }
3827   }
3828   // Iterate over the definitions in each instruction, and compute the
3829   // stage difference for each use.  Keep the maximum value.
3830   for (auto &I : InstrToCycle) {
3831     int DefStage = stageScheduled(I.first);
3832     MachineInstr *MI = I.first->getInstr();
3833     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3834       MachineOperand &Op = MI->getOperand(i);
3835       if (!Op.isReg() || !Op.isDef())
3836         continue;
3837 
3838       unsigned Reg = Op.getReg();
3839       unsigned MaxDiff = 0;
3840       bool PhiIsSwapped = false;
3841       for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
3842                                              EI = MRI.use_end();
3843            UI != EI; ++UI) {
3844         MachineOperand &UseOp = *UI;
3845         MachineInstr *UseMI = UseOp.getParent();
3846         SUnit *SUnitUse = SSD->getSUnit(UseMI);
3847         int UseStage = stageScheduled(SUnitUse);
3848         unsigned Diff = 0;
3849         if (UseStage != -1 && UseStage >= DefStage)
3850           Diff = UseStage - DefStage;
3851         if (MI->isPHI()) {
3852           if (isLoopCarried(SSD, *MI))
3853             ++Diff;
3854           else
3855             PhiIsSwapped = true;
3856         }
3857         MaxDiff = std::max(Diff, MaxDiff);
3858       }
3859       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
3860     }
3861   }
3862 
3863   // Erase all the elements in the later stages. Only one iteration should
3864   // remain in the scheduled list, and it contains all the instructions.
3865   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3866     ScheduledInstrs.erase(cycle);
3867 
3868   // Change the registers in instruction as specified in the InstrChanges
3869   // map. We need to use the new registers to create the correct order.
3870   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
3871     SUnit *SU = &SSD->SUnits[i];
3872     SSD->applyInstrChange(SU->getInstr(), *this);
3873   }
3874 
3875   // Reorder the instructions in each cycle to fix and improve the
3876   // generated code.
3877   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3878     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3879     std::deque<SUnit *> newOrderPhi;
3880     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3881       SUnit *SU = cycleInstrs[i];
3882       if (SU->getInstr()->isPHI())
3883         newOrderPhi.push_back(SU);
3884     }
3885     std::deque<SUnit *> newOrderI;
3886     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3887       SUnit *SU = cycleInstrs[i];
3888       if (!SU->getInstr()->isPHI())
3889         orderDependence(SSD, SU, newOrderI);
3890     }
3891     // Replace the old order with the new order.
3892     cycleInstrs.swap(newOrderPhi);
3893     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
3894     SSD->fixupRegisterOverlaps(cycleInstrs);
3895   }
3896 
3897   LLVM_DEBUG(dump(););
3898 }
3899 
3900 void NodeSet::print(raw_ostream &os) const {
3901   os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3902      << " depth " << MaxDepth << " col " << Colocate << "\n";
3903   for (const auto &I : Nodes)
3904     os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
3905   os << "\n";
3906 }
3907 
3908 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3909 /// Print the schedule information to the given output.
3910 void SMSchedule::print(raw_ostream &os) const {
3911   // Iterate over each cycle.
3912   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3913     // Iterate over each instruction in the cycle.
3914     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3915     for (SUnit *CI : cycleInstrs->second) {
3916       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3917       os << "(" << CI->NodeNum << ") ";
3918       CI->getInstr()->print(os);
3919       os << "\n";
3920     }
3921   }
3922 }
3923 
3924 /// Utility function used for debugging to print the schedule.
3925 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3926 LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3927 
3928 #endif
3929 
3930 void ResourceManager::initProcResourceVectors(
3931     const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3932   unsigned ProcResourceID = 0;
3933 
3934   // We currently limit the resource kinds to 64 and below so that we can use
3935   // uint64_t for Masks
3936   assert(SM.getNumProcResourceKinds() < 64 &&
3937          "Too many kinds of resources, unsupported");
3938   // Create a unique bitmask for every processor resource unit.
3939   // Skip resource at index 0, since it always references 'InvalidUnit'.
3940   Masks.resize(SM.getNumProcResourceKinds());
3941   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3942     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3943     if (Desc.SubUnitsIdxBegin)
3944       continue;
3945     Masks[I] = 1ULL << ProcResourceID;
3946     ProcResourceID++;
3947   }
3948   // Create a unique bitmask for every processor resource group.
3949   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3950     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3951     if (!Desc.SubUnitsIdxBegin)
3952       continue;
3953     Masks[I] = 1ULL << ProcResourceID;
3954     for (unsigned U = 0; U < Desc.NumUnits; ++U)
3955       Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3956     ProcResourceID++;
3957   }
3958   LLVM_DEBUG({
3959     dbgs() << "ProcResourceDesc:\n";
3960     for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3961       const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3962       dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3963                        ProcResource->Name, I, Masks[I], ProcResource->NumUnits);
3964     }
3965     dbgs() << " -----------------\n";
3966   });
3967 }
3968 
3969 bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const {
3970 
3971   LLVM_DEBUG({ dbgs() << "canReserveResources:\n"; });
3972   if (UseDFA)
3973     return DFAResources->canReserveResources(MID);
3974 
3975   unsigned InsnClass = MID->getSchedClass();
3976   const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
3977   if (!SCDesc->isValid()) {
3978     LLVM_DEBUG({
3979       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3980       dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
3981     });
3982     return true;
3983   }
3984 
3985   const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc);
3986   const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc);
3987   for (; I != E; ++I) {
3988     if (!I->Cycles)
3989       continue;
3990     const MCProcResourceDesc *ProcResource =
3991         SM.getProcResource(I->ProcResourceIdx);
3992     unsigned NumUnits = ProcResource->NumUnits;
3993     LLVM_DEBUG({
3994       dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
3995                        ProcResource->Name, I->ProcResourceIdx,
3996                        ProcResourceCount[I->ProcResourceIdx], NumUnits,
3997                        I->Cycles);
3998     });
3999     if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits)
4000       return false;
4001   }
4002   LLVM_DEBUG(dbgs() << "return true\n\n";);
4003   return true;
4004 }
4005 
4006 void ResourceManager::reserveResources(const MCInstrDesc *MID) {
4007   LLVM_DEBUG({ dbgs() << "reserveResources:\n"; });
4008   if (UseDFA)
4009     return DFAResources->reserveResources(MID);
4010 
4011   unsigned InsnClass = MID->getSchedClass();
4012   const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
4013   if (!SCDesc->isValid()) {
4014     LLVM_DEBUG({
4015       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4016       dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
4017     });
4018     return;
4019   }
4020   for (const MCWriteProcResEntry &PRE :
4021        make_range(STI->getWriteProcResBegin(SCDesc),
4022                   STI->getWriteProcResEnd(SCDesc))) {
4023     if (!PRE.Cycles)
4024       continue;
4025     ++ProcResourceCount[PRE.ProcResourceIdx];
4026     LLVM_DEBUG({
4027       const MCProcResourceDesc *ProcResource =
4028           SM.getProcResource(PRE.ProcResourceIdx);
4029       dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
4030                        ProcResource->Name, PRE.ProcResourceIdx,
4031                        ProcResourceCount[PRE.ProcResourceIdx],
4032                        ProcResource->NumUnits, PRE.Cycles);
4033     });
4034   }
4035   LLVM_DEBUG({ dbgs() << "reserveResources: done!\n\n"; });
4036 }
4037 
4038 bool ResourceManager::canReserveResources(const MachineInstr &MI) const {
4039   return canReserveResources(&MI.getDesc());
4040 }
4041 
4042 void ResourceManager::reserveResources(const MachineInstr &MI) {
4043   return reserveResources(&MI.getDesc());
4044 }
4045 
4046 void ResourceManager::clearResources() {
4047   if (UseDFA)
4048     return DFAResources->clearResources();
4049   std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
4050 }
4051 
4052