1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10 //
11 // This SMS implementation is a target-independent back-end pass. When enabled,
12 // the pass runs just prior to the register allocation pass, while the machine
13 // IR is in SSA form. If software pipelining is successful, then the original
14 // loop is replaced by the optimized loop. The optimized loop contains one or
15 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16 // the instructions cannot be scheduled in a given MII, we increase the MII by
17 // one and try again.
18 //
19 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20 // represent loop carried dependences in the DAG as order edges to the Phi
21 // nodes. We also perform several passes over the DAG to eliminate unnecessary
22 // edges that inhibit the ability to pipeline. The implementation uses the
23 // DFAPacketizer class to compute the minimum initiation interval and the check
24 // where an instruction may be inserted in the pipelined schedule.
25 //
26 // In order for the SMS pass to work, several target specific hooks need to be
27 // implemented to get information about the loop structure and to rewrite
28 // instructions.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #include "llvm/ADT/ArrayRef.h"
33 #include "llvm/ADT/BitVector.h"
34 #include "llvm/ADT/DenseMap.h"
35 #include "llvm/ADT/MapVector.h"
36 #include "llvm/ADT/PriorityQueue.h"
37 #include "llvm/ADT/SetVector.h"
38 #include "llvm/ADT/SmallPtrSet.h"
39 #include "llvm/ADT/SmallSet.h"
40 #include "llvm/ADT/SmallVector.h"
41 #include "llvm/ADT/Statistic.h"
42 #include "llvm/ADT/iterator_range.h"
43 #include "llvm/Analysis/AliasAnalysis.h"
44 #include "llvm/Analysis/MemoryLocation.h"
45 #include "llvm/Analysis/ValueTracking.h"
46 #include "llvm/CodeGen/DFAPacketizer.h"
47 #include "llvm/CodeGen/LiveIntervals.h"
48 #include "llvm/CodeGen/MachineBasicBlock.h"
49 #include "llvm/CodeGen/MachineDominators.h"
50 #include "llvm/CodeGen/MachineFunction.h"
51 #include "llvm/CodeGen/MachineFunctionPass.h"
52 #include "llvm/CodeGen/MachineInstr.h"
53 #include "llvm/CodeGen/MachineInstrBuilder.h"
54 #include "llvm/CodeGen/MachineLoopInfo.h"
55 #include "llvm/CodeGen/MachineMemOperand.h"
56 #include "llvm/CodeGen/MachineOperand.h"
57 #include "llvm/CodeGen/MachinePipeliner.h"
58 #include "llvm/CodeGen/MachineRegisterInfo.h"
59 #include "llvm/CodeGen/RegisterPressure.h"
60 #include "llvm/CodeGen/ScheduleDAG.h"
61 #include "llvm/CodeGen/ScheduleDAGMutation.h"
62 #include "llvm/CodeGen/TargetOpcodes.h"
63 #include "llvm/CodeGen/TargetRegisterInfo.h"
64 #include "llvm/CodeGen/TargetSubtargetInfo.h"
65 #include "llvm/Config/llvm-config.h"
66 #include "llvm/IR/Attributes.h"
67 #include "llvm/IR/DebugLoc.h"
68 #include "llvm/IR/Function.h"
69 #include "llvm/MC/LaneBitmask.h"
70 #include "llvm/MC/MCInstrDesc.h"
71 #include "llvm/MC/MCInstrItineraries.h"
72 #include "llvm/MC/MCRegisterInfo.h"
73 #include "llvm/Pass.h"
74 #include "llvm/Support/CommandLine.h"
75 #include "llvm/Support/Compiler.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/MathExtras.h"
78 #include "llvm/Support/raw_ostream.h"
79 #include <algorithm>
80 #include <cassert>
81 #include <climits>
82 #include <cstdint>
83 #include <deque>
84 #include <functional>
85 #include <iterator>
86 #include <map>
87 #include <memory>
88 #include <tuple>
89 #include <utility>
90 #include <vector>
91 
92 using namespace llvm;
93 
94 #define DEBUG_TYPE "pipeliner"
95 
96 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
97 STATISTIC(NumPipelined, "Number of loops software pipelined");
98 STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
99 
100 /// A command line option to turn software pipelining on or off.
101 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
102                                cl::ZeroOrMore,
103                                cl::desc("Enable Software Pipelining"));
104 
105 /// A command line option to enable SWP at -Os.
106 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
107                                       cl::desc("Enable SWP at Os."), cl::Hidden,
108                                       cl::init(false));
109 
110 /// A command line argument to limit minimum initial interval for pipelining.
111 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
112                               cl::desc("Size limit for the MII."),
113                               cl::Hidden, cl::init(27));
114 
115 /// A command line argument to limit the number of stages in the pipeline.
116 static cl::opt<int>
117     SwpMaxStages("pipeliner-max-stages",
118                  cl::desc("Maximum stages allowed in the generated scheduled."),
119                  cl::Hidden, cl::init(3));
120 
121 /// A command line option to disable the pruning of chain dependences due to
122 /// an unrelated Phi.
123 static cl::opt<bool>
124     SwpPruneDeps("pipeliner-prune-deps",
125                  cl::desc("Prune dependences between unrelated Phi nodes."),
126                  cl::Hidden, cl::init(true));
127 
128 /// A command line option to disable the pruning of loop carried order
129 /// dependences.
130 static cl::opt<bool>
131     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
132                         cl::desc("Prune loop carried order dependences."),
133                         cl::Hidden, cl::init(true));
134 
135 #ifndef NDEBUG
136 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
137 #endif
138 
139 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
140                                      cl::ReallyHidden, cl::init(false),
141                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
142 
143 namespace llvm {
144 
145 // A command line option to enable the CopyToPhi DAG mutation.
146 cl::opt<bool>
147     SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
148                        cl::init(true), cl::ZeroOrMore,
149                        cl::desc("Enable CopyToPhi DAG Mutation"));
150 
151 } // end namespace llvm
152 
153 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
154 char MachinePipeliner::ID = 0;
155 #ifndef NDEBUG
156 int MachinePipeliner::NumTries = 0;
157 #endif
158 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
159 
160 INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
161                       "Modulo Software Pipelining", false, false)
162 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
163 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
164 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
165 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
166 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
167                     "Modulo Software Pipelining", false, false)
168 
169 /// The "main" function for implementing Swing Modulo Scheduling.
170 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
171   if (skipFunction(mf.getFunction()))
172     return false;
173 
174   if (!EnableSWP)
175     return false;
176 
177   if (mf.getFunction().getAttributes().hasAttribute(
178           AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
179       !EnableSWPOptSize.getPosition())
180     return false;
181 
182   // Cannot pipeline loops without instruction itineraries if we are using
183   // DFA for the pipeliner.
184   if (mf.getSubtarget().useDFAforSMS() &&
185       (!mf.getSubtarget().getInstrItineraryData() ||
186        mf.getSubtarget().getInstrItineraryData()->isEmpty()))
187     return false;
188 
189   MF = &mf;
190   MLI = &getAnalysis<MachineLoopInfo>();
191   MDT = &getAnalysis<MachineDominatorTree>();
192   TII = MF->getSubtarget().getInstrInfo();
193   RegClassInfo.runOnMachineFunction(*MF);
194 
195   for (auto &L : *MLI)
196     scheduleLoop(*L);
197 
198   return false;
199 }
200 
201 /// Attempt to perform the SMS algorithm on the specified loop. This function is
202 /// the main entry point for the algorithm.  The function identifies candidate
203 /// loops, calculates the minimum initiation interval, and attempts to schedule
204 /// the loop.
205 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
206   bool Changed = false;
207   for (auto &InnerLoop : L)
208     Changed |= scheduleLoop(*InnerLoop);
209 
210 #ifndef NDEBUG
211   // Stop trying after reaching the limit (if any).
212   int Limit = SwpLoopLimit;
213   if (Limit >= 0) {
214     if (NumTries >= SwpLoopLimit)
215       return Changed;
216     NumTries++;
217   }
218 #endif
219 
220   setPragmaPipelineOptions(L);
221   if (!canPipelineLoop(L)) {
222     LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
223     return Changed;
224   }
225 
226   ++NumTrytoPipeline;
227 
228   Changed = swingModuloScheduler(L);
229 
230   return Changed;
231 }
232 
233 void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
234   MachineBasicBlock *LBLK = L.getTopBlock();
235 
236   if (LBLK == nullptr)
237     return;
238 
239   const BasicBlock *BBLK = LBLK->getBasicBlock();
240   if (BBLK == nullptr)
241     return;
242 
243   const Instruction *TI = BBLK->getTerminator();
244   if (TI == nullptr)
245     return;
246 
247   MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
248   if (LoopID == nullptr)
249     return;
250 
251   assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
252   assert(LoopID->getOperand(0) == LoopID && "invalid loop");
253 
254   for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
255     MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
256 
257     if (MD == nullptr)
258       continue;
259 
260     MDString *S = dyn_cast<MDString>(MD->getOperand(0));
261 
262     if (S == nullptr)
263       continue;
264 
265     if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
266       assert(MD->getNumOperands() == 2 &&
267              "Pipeline initiation interval hint metadata should have two operands.");
268       II_setByPragma =
269           mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
270       assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
271     } else if (S->getString() == "llvm.loop.pipeline.disable") {
272       disabledByPragma = true;
273     }
274   }
275 }
276 
277 /// Return true if the loop can be software pipelined.  The algorithm is
278 /// restricted to loops with a single basic block.  Make sure that the
279 /// branch in the loop can be analyzed.
280 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
281   if (L.getNumBlocks() != 1)
282     return false;
283 
284   if (disabledByPragma)
285     return false;
286 
287   // Check if the branch can't be understood because we can't do pipelining
288   // if that's the case.
289   LI.TBB = nullptr;
290   LI.FBB = nullptr;
291   LI.BrCond.clear();
292   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
293     return false;
294 
295   LI.LoopInductionVar = nullptr;
296   LI.LoopCompare = nullptr;
297   if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
298     return false;
299 
300   if (!L.getLoopPreheader())
301     return false;
302 
303   // Remove any subregisters from inputs to phi nodes.
304   preprocessPhiNodes(*L.getHeader());
305   return true;
306 }
307 
308 void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
309   MachineRegisterInfo &MRI = MF->getRegInfo();
310   SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
311 
312   for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
313     MachineOperand &DefOp = PI.getOperand(0);
314     assert(DefOp.getSubReg() == 0);
315     auto *RC = MRI.getRegClass(DefOp.getReg());
316 
317     for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
318       MachineOperand &RegOp = PI.getOperand(i);
319       if (RegOp.getSubReg() == 0)
320         continue;
321 
322       // If the operand uses a subregister, replace it with a new register
323       // without subregisters, and generate a copy to the new register.
324       unsigned NewReg = MRI.createVirtualRegister(RC);
325       MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
326       MachineBasicBlock::iterator At = PredB.getFirstTerminator();
327       const DebugLoc &DL = PredB.findDebugLoc(At);
328       auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
329                     .addReg(RegOp.getReg(), getRegState(RegOp),
330                             RegOp.getSubReg());
331       Slots.insertMachineInstrInMaps(*Copy);
332       RegOp.setReg(NewReg);
333       RegOp.setSubReg(0);
334     }
335   }
336 }
337 
338 /// The SMS algorithm consists of the following main steps:
339 /// 1. Computation and analysis of the dependence graph.
340 /// 2. Ordering of the nodes (instructions).
341 /// 3. Attempt to Schedule the loop.
342 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
343   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
344 
345   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
346                         II_setByPragma);
347 
348   MachineBasicBlock *MBB = L.getHeader();
349   // The kernel should not include any terminator instructions.  These
350   // will be added back later.
351   SMS.startBlock(MBB);
352 
353   // Compute the number of 'real' instructions in the basic block by
354   // ignoring terminators.
355   unsigned size = MBB->size();
356   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
357                                    E = MBB->instr_end();
358        I != E; ++I, --size)
359     ;
360 
361   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
362   SMS.schedule();
363   SMS.exitRegion();
364 
365   SMS.finishBlock();
366   return SMS.hasNewSchedule();
367 }
368 
369 void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
370   if (II_setByPragma > 0)
371     MII = II_setByPragma;
372   else
373     MII = std::max(ResMII, RecMII);
374 }
375 
376 void SwingSchedulerDAG::setMAX_II() {
377   if (II_setByPragma > 0)
378     MAX_II = II_setByPragma;
379   else
380     MAX_II = MII + 10;
381 }
382 
383 /// We override the schedule function in ScheduleDAGInstrs to implement the
384 /// scheduling part of the Swing Modulo Scheduling algorithm.
385 void SwingSchedulerDAG::schedule() {
386   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
387   buildSchedGraph(AA);
388   addLoopCarriedDependences(AA);
389   updatePhiDependences();
390   Topo.InitDAGTopologicalSorting();
391   changeDependences();
392   postprocessDAG();
393   LLVM_DEBUG(dump());
394 
395   NodeSetType NodeSets;
396   findCircuits(NodeSets);
397   NodeSetType Circuits = NodeSets;
398 
399   // Calculate the MII.
400   unsigned ResMII = calculateResMII();
401   unsigned RecMII = calculateRecMII(NodeSets);
402 
403   fuseRecs(NodeSets);
404 
405   // This flag is used for testing and can cause correctness problems.
406   if (SwpIgnoreRecMII)
407     RecMII = 0;
408 
409   setMII(ResMII, RecMII);
410   setMAX_II();
411 
412   LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
413                     << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
414 
415   // Can't schedule a loop without a valid MII.
416   if (MII == 0)
417     return;
418 
419   // Don't pipeline large loops.
420   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
421     return;
422 
423   computeNodeFunctions(NodeSets);
424 
425   registerPressureFilter(NodeSets);
426 
427   colocateNodeSets(NodeSets);
428 
429   checkNodeSets(NodeSets);
430 
431   LLVM_DEBUG({
432     for (auto &I : NodeSets) {
433       dbgs() << "  Rec NodeSet ";
434       I.dump();
435     }
436   });
437 
438   llvm::stable_sort(NodeSets, std::greater<NodeSet>());
439 
440   groupRemainingNodes(NodeSets);
441 
442   removeDuplicateNodes(NodeSets);
443 
444   LLVM_DEBUG({
445     for (auto &I : NodeSets) {
446       dbgs() << "  NodeSet ";
447       I.dump();
448     }
449   });
450 
451   computeNodeOrder(NodeSets);
452 
453   // check for node order issues
454   checkValidNodeOrder(Circuits);
455 
456   SMSchedule Schedule(Pass.MF);
457   Scheduled = schedulePipeline(Schedule);
458 
459   if (!Scheduled)
460     return;
461 
462   unsigned numStages = Schedule.getMaxStageCount();
463   // No need to generate pipeline if there are no overlapped iterations.
464   if (numStages == 0)
465     return;
466 
467   // Check that the maximum stage count is less than user-defined limit.
468   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
469     return;
470 
471   generatePipelinedLoop(Schedule);
472   ++NumPipelined;
473 }
474 
475 /// Clean up after the software pipeliner runs.
476 void SwingSchedulerDAG::finishBlock() {
477   for (MachineInstr *I : NewMIs)
478     MF.DeleteMachineInstr(I);
479   NewMIs.clear();
480 
481   // Call the superclass.
482   ScheduleDAGInstrs::finishBlock();
483 }
484 
485 /// Return the register values for  the operands of a Phi instruction.
486 /// This function assume the instruction is a Phi.
487 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
488                        unsigned &InitVal, unsigned &LoopVal) {
489   assert(Phi.isPHI() && "Expecting a Phi.");
490 
491   InitVal = 0;
492   LoopVal = 0;
493   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
494     if (Phi.getOperand(i + 1).getMBB() != Loop)
495       InitVal = Phi.getOperand(i).getReg();
496     else
497       LoopVal = Phi.getOperand(i).getReg();
498 
499   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
500 }
501 
502 /// Return the Phi register value that comes from the incoming block.
503 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
504   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
505     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
506       return Phi.getOperand(i).getReg();
507   return 0;
508 }
509 
510 /// Return the Phi register value that comes the loop block.
511 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
512   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
513     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
514       return Phi.getOperand(i).getReg();
515   return 0;
516 }
517 
518 /// Return true if SUb can be reached from SUa following the chain edges.
519 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
520   SmallPtrSet<SUnit *, 8> Visited;
521   SmallVector<SUnit *, 8> Worklist;
522   Worklist.push_back(SUa);
523   while (!Worklist.empty()) {
524     const SUnit *SU = Worklist.pop_back_val();
525     for (auto &SI : SU->Succs) {
526       SUnit *SuccSU = SI.getSUnit();
527       if (SI.getKind() == SDep::Order) {
528         if (Visited.count(SuccSU))
529           continue;
530         if (SuccSU == SUb)
531           return true;
532         Worklist.push_back(SuccSU);
533         Visited.insert(SuccSU);
534       }
535     }
536   }
537   return false;
538 }
539 
540 /// Return true if the instruction causes a chain between memory
541 /// references before and after it.
542 static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
543   return MI.isCall() || MI.hasUnmodeledSideEffects() ||
544          (MI.hasOrderedMemoryRef() &&
545           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
546 }
547 
548 /// Return the underlying objects for the memory references of an instruction.
549 /// This function calls the code in ValueTracking, but first checks that the
550 /// instruction has a memory operand.
551 static void getUnderlyingObjects(const MachineInstr *MI,
552                                  SmallVectorImpl<const Value *> &Objs,
553                                  const DataLayout &DL) {
554   if (!MI->hasOneMemOperand())
555     return;
556   MachineMemOperand *MM = *MI->memoperands_begin();
557   if (!MM->getValue())
558     return;
559   GetUnderlyingObjects(MM->getValue(), Objs, DL);
560   for (const Value *V : Objs) {
561     if (!isIdentifiedObject(V)) {
562       Objs.clear();
563       return;
564     }
565     Objs.push_back(V);
566   }
567 }
568 
569 /// Add a chain edge between a load and store if the store can be an
570 /// alias of the load on a subsequent iteration, i.e., a loop carried
571 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
572 /// but that code doesn't create loop carried dependences.
573 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
574   MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
575   Value *UnknownValue =
576     UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
577   for (auto &SU : SUnits) {
578     MachineInstr &MI = *SU.getInstr();
579     if (isDependenceBarrier(MI, AA))
580       PendingLoads.clear();
581     else if (MI.mayLoad()) {
582       SmallVector<const Value *, 4> Objs;
583       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
584       if (Objs.empty())
585         Objs.push_back(UnknownValue);
586       for (auto V : Objs) {
587         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
588         SUs.push_back(&SU);
589       }
590     } else if (MI.mayStore()) {
591       SmallVector<const Value *, 4> Objs;
592       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
593       if (Objs.empty())
594         Objs.push_back(UnknownValue);
595       for (auto V : Objs) {
596         MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
597             PendingLoads.find(V);
598         if (I == PendingLoads.end())
599           continue;
600         for (auto Load : I->second) {
601           if (isSuccOrder(Load, &SU))
602             continue;
603           MachineInstr &LdMI = *Load->getInstr();
604           // First, perform the cheaper check that compares the base register.
605           // If they are the same and the load offset is less than the store
606           // offset, then mark the dependence as loop carried potentially.
607           const MachineOperand *BaseOp1, *BaseOp2;
608           int64_t Offset1, Offset2;
609           if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&
610               TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
611             if (BaseOp1->isIdenticalTo(*BaseOp2) &&
612                 (int)Offset1 < (int)Offset2) {
613               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
614                      "What happened to the chain edge?");
615               SDep Dep(Load, SDep::Barrier);
616               Dep.setLatency(1);
617               SU.addPred(Dep);
618               continue;
619             }
620           }
621           // Second, the more expensive check that uses alias analysis on the
622           // base registers. If they alias, and the load offset is less than
623           // the store offset, the mark the dependence as loop carried.
624           if (!AA) {
625             SDep Dep(Load, SDep::Barrier);
626             Dep.setLatency(1);
627             SU.addPred(Dep);
628             continue;
629           }
630           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
631           MachineMemOperand *MMO2 = *MI.memoperands_begin();
632           if (!MMO1->getValue() || !MMO2->getValue()) {
633             SDep Dep(Load, SDep::Barrier);
634             Dep.setLatency(1);
635             SU.addPred(Dep);
636             continue;
637           }
638           if (MMO1->getValue() == MMO2->getValue() &&
639               MMO1->getOffset() <= MMO2->getOffset()) {
640             SDep Dep(Load, SDep::Barrier);
641             Dep.setLatency(1);
642             SU.addPred(Dep);
643             continue;
644           }
645           AliasResult AAResult = AA->alias(
646               MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
647                              MMO1->getAAInfo()),
648               MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
649                              MMO2->getAAInfo()));
650 
651           if (AAResult != NoAlias) {
652             SDep Dep(Load, SDep::Barrier);
653             Dep.setLatency(1);
654             SU.addPred(Dep);
655           }
656         }
657       }
658     }
659   }
660 }
661 
662 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
663 /// processes dependences for PHIs. This function adds true dependences
664 /// from a PHI to a use, and a loop carried dependence from the use to the
665 /// PHI. The loop carried dependence is represented as an anti dependence
666 /// edge. This function also removes chain dependences between unrelated
667 /// PHIs.
668 void SwingSchedulerDAG::updatePhiDependences() {
669   SmallVector<SDep, 4> RemoveDeps;
670   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
671 
672   // Iterate over each DAG node.
673   for (SUnit &I : SUnits) {
674     RemoveDeps.clear();
675     // Set to true if the instruction has an operand defined by a Phi.
676     unsigned HasPhiUse = 0;
677     unsigned HasPhiDef = 0;
678     MachineInstr *MI = I.getInstr();
679     // Iterate over each operand, and we process the definitions.
680     for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
681                                     MOE = MI->operands_end();
682          MOI != MOE; ++MOI) {
683       if (!MOI->isReg())
684         continue;
685       unsigned Reg = MOI->getReg();
686       if (MOI->isDef()) {
687         // If the register is used by a Phi, then create an anti dependence.
688         for (MachineRegisterInfo::use_instr_iterator
689                  UI = MRI.use_instr_begin(Reg),
690                  UE = MRI.use_instr_end();
691              UI != UE; ++UI) {
692           MachineInstr *UseMI = &*UI;
693           SUnit *SU = getSUnit(UseMI);
694           if (SU != nullptr && UseMI->isPHI()) {
695             if (!MI->isPHI()) {
696               SDep Dep(SU, SDep::Anti, Reg);
697               Dep.setLatency(1);
698               I.addPred(Dep);
699             } else {
700               HasPhiDef = Reg;
701               // Add a chain edge to a dependent Phi that isn't an existing
702               // predecessor.
703               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
704                 I.addPred(SDep(SU, SDep::Barrier));
705             }
706           }
707         }
708       } else if (MOI->isUse()) {
709         // If the register is defined by a Phi, then create a true dependence.
710         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
711         if (DefMI == nullptr)
712           continue;
713         SUnit *SU = getSUnit(DefMI);
714         if (SU != nullptr && DefMI->isPHI()) {
715           if (!MI->isPHI()) {
716             SDep Dep(SU, SDep::Data, Reg);
717             Dep.setLatency(0);
718             ST.adjustSchedDependency(SU, &I, Dep);
719             I.addPred(Dep);
720           } else {
721             HasPhiUse = Reg;
722             // Add a chain edge to a dependent Phi that isn't an existing
723             // predecessor.
724             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
725               I.addPred(SDep(SU, SDep::Barrier));
726           }
727         }
728       }
729     }
730     // Remove order dependences from an unrelated Phi.
731     if (!SwpPruneDeps)
732       continue;
733     for (auto &PI : I.Preds) {
734       MachineInstr *PMI = PI.getSUnit()->getInstr();
735       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
736         if (I.getInstr()->isPHI()) {
737           if (PMI->getOperand(0).getReg() == HasPhiUse)
738             continue;
739           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
740             continue;
741         }
742         RemoveDeps.push_back(PI);
743       }
744     }
745     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
746       I.removePred(RemoveDeps[i]);
747   }
748 }
749 
750 /// Iterate over each DAG node and see if we can change any dependences
751 /// in order to reduce the recurrence MII.
752 void SwingSchedulerDAG::changeDependences() {
753   // See if an instruction can use a value from the previous iteration.
754   // If so, we update the base and offset of the instruction and change
755   // the dependences.
756   for (SUnit &I : SUnits) {
757     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
758     int64_t NewOffset = 0;
759     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
760                                NewOffset))
761       continue;
762 
763     // Get the MI and SUnit for the instruction that defines the original base.
764     unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
765     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
766     if (!DefMI)
767       continue;
768     SUnit *DefSU = getSUnit(DefMI);
769     if (!DefSU)
770       continue;
771     // Get the MI and SUnit for the instruction that defins the new base.
772     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
773     if (!LastMI)
774       continue;
775     SUnit *LastSU = getSUnit(LastMI);
776     if (!LastSU)
777       continue;
778 
779     if (Topo.IsReachable(&I, LastSU))
780       continue;
781 
782     // Remove the dependence. The value now depends on a prior iteration.
783     SmallVector<SDep, 4> Deps;
784     for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
785          ++P)
786       if (P->getSUnit() == DefSU)
787         Deps.push_back(*P);
788     for (int i = 0, e = Deps.size(); i != e; i++) {
789       Topo.RemovePred(&I, Deps[i].getSUnit());
790       I.removePred(Deps[i]);
791     }
792     // Remove the chain dependence between the instructions.
793     Deps.clear();
794     for (auto &P : LastSU->Preds)
795       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
796         Deps.push_back(P);
797     for (int i = 0, e = Deps.size(); i != e; i++) {
798       Topo.RemovePred(LastSU, Deps[i].getSUnit());
799       LastSU->removePred(Deps[i]);
800     }
801 
802     // Add a dependence between the new instruction and the instruction
803     // that defines the new base.
804     SDep Dep(&I, SDep::Anti, NewBase);
805     Topo.AddPred(LastSU, &I);
806     LastSU->addPred(Dep);
807 
808     // Remember the base and offset information so that we can update the
809     // instruction during code generation.
810     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
811   }
812 }
813 
814 namespace {
815 
816 // FuncUnitSorter - Comparison operator used to sort instructions by
817 // the number of functional unit choices.
818 struct FuncUnitSorter {
819   const InstrItineraryData *InstrItins;
820   const MCSubtargetInfo *STI;
821   DenseMap<unsigned, unsigned> Resources;
822 
823   FuncUnitSorter(const TargetSubtargetInfo &TSI)
824       : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
825 
826   // Compute the number of functional unit alternatives needed
827   // at each stage, and take the minimum value. We prioritize the
828   // instructions by the least number of choices first.
829   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
830     unsigned SchedClass = Inst->getDesc().getSchedClass();
831     unsigned min = UINT_MAX;
832     if (InstrItins && !InstrItins->isEmpty()) {
833       for (const InstrStage &IS :
834            make_range(InstrItins->beginStage(SchedClass),
835                       InstrItins->endStage(SchedClass))) {
836         unsigned funcUnits = IS.getUnits();
837         unsigned numAlternatives = countPopulation(funcUnits);
838         if (numAlternatives < min) {
839           min = numAlternatives;
840           F = funcUnits;
841         }
842       }
843       return min;
844     }
845     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
846       const MCSchedClassDesc *SCDesc =
847           STI->getSchedModel().getSchedClassDesc(SchedClass);
848       if (!SCDesc->isValid())
849         // No valid Schedule Class Desc for schedClass, should be
850         // Pseudo/PostRAPseudo
851         return min;
852 
853       for (const MCWriteProcResEntry &PRE :
854            make_range(STI->getWriteProcResBegin(SCDesc),
855                       STI->getWriteProcResEnd(SCDesc))) {
856         if (!PRE.Cycles)
857           continue;
858         const MCProcResourceDesc *ProcResource =
859             STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
860         unsigned NumUnits = ProcResource->NumUnits;
861         if (NumUnits < min) {
862           min = NumUnits;
863           F = PRE.ProcResourceIdx;
864         }
865       }
866       return min;
867     }
868     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
869   }
870 
871   // Compute the critical resources needed by the instruction. This
872   // function records the functional units needed by instructions that
873   // must use only one functional unit. We use this as a tie breaker
874   // for computing the resource MII. The instrutions that require
875   // the same, highly used, functional unit have high priority.
876   void calcCriticalResources(MachineInstr &MI) {
877     unsigned SchedClass = MI.getDesc().getSchedClass();
878     if (InstrItins && !InstrItins->isEmpty()) {
879       for (const InstrStage &IS :
880            make_range(InstrItins->beginStage(SchedClass),
881                       InstrItins->endStage(SchedClass))) {
882         unsigned FuncUnits = IS.getUnits();
883         if (countPopulation(FuncUnits) == 1)
884           Resources[FuncUnits]++;
885       }
886       return;
887     }
888     if (STI && STI->getSchedModel().hasInstrSchedModel()) {
889       const MCSchedClassDesc *SCDesc =
890           STI->getSchedModel().getSchedClassDesc(SchedClass);
891       if (!SCDesc->isValid())
892         // No valid Schedule Class Desc for schedClass, should be
893         // Pseudo/PostRAPseudo
894         return;
895 
896       for (const MCWriteProcResEntry &PRE :
897            make_range(STI->getWriteProcResBegin(SCDesc),
898                       STI->getWriteProcResEnd(SCDesc))) {
899         if (!PRE.Cycles)
900           continue;
901         Resources[PRE.ProcResourceIdx]++;
902       }
903       return;
904     }
905     llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
906   }
907 
908   /// Return true if IS1 has less priority than IS2.
909   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
910     unsigned F1 = 0, F2 = 0;
911     unsigned MFUs1 = minFuncUnits(IS1, F1);
912     unsigned MFUs2 = minFuncUnits(IS2, F2);
913     if (MFUs1 == 1 && MFUs2 == 1)
914       return Resources.lookup(F1) < Resources.lookup(F2);
915     return MFUs1 > MFUs2;
916   }
917 };
918 
919 } // end anonymous namespace
920 
921 /// Calculate the resource constrained minimum initiation interval for the
922 /// specified loop. We use the DFA to model the resources needed for
923 /// each instruction, and we ignore dependences. A different DFA is created
924 /// for each cycle that is required. When adding a new instruction, we attempt
925 /// to add it to each existing DFA, until a legal space is found. If the
926 /// instruction cannot be reserved in an existing DFA, we create a new one.
927 unsigned SwingSchedulerDAG::calculateResMII() {
928 
929   SmallVector<ResourceManager*, 8> Resources;
930   MachineBasicBlock *MBB = Loop.getHeader();
931   Resources.push_back(new ResourceManager(&MF.getSubtarget()));
932 
933   // Sort the instructions by the number of available choices for scheduling,
934   // least to most. Use the number of critical resources as the tie breaker.
935   FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget());
936   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
937                                    E = MBB->getFirstTerminator();
938        I != E; ++I)
939     FUS.calcCriticalResources(*I);
940   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
941       FuncUnitOrder(FUS);
942 
943   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
944                                    E = MBB->getFirstTerminator();
945        I != E; ++I)
946     FuncUnitOrder.push(&*I);
947 
948   while (!FuncUnitOrder.empty()) {
949     MachineInstr *MI = FuncUnitOrder.top();
950     FuncUnitOrder.pop();
951     if (TII->isZeroCost(MI->getOpcode()))
952       continue;
953     // Attempt to reserve the instruction in an existing DFA. At least one
954     // DFA is needed for each cycle.
955     unsigned NumCycles = getSUnit(MI)->Latency;
956     unsigned ReservedCycles = 0;
957     SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin();
958     SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end();
959     for (unsigned C = 0; C < NumCycles; ++C)
960       while (RI != RE) {
961         if ((*RI++)->canReserveResources(*MI)) {
962           ++ReservedCycles;
963           break;
964         }
965       }
966     // Start reserving resources using existing DFAs.
967     for (unsigned C = 0; C < ReservedCycles; ++C) {
968       --RI;
969       (*RI)->reserveResources(*MI);
970     }
971     // Add new DFAs, if needed, to reserve resources.
972     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
973       ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget());
974       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
975       NewResource->reserveResources(*MI);
976       Resources.push_back(NewResource);
977     }
978   }
979   int Resmii = Resources.size();
980   // Delete the memory for each of the DFAs that were created earlier.
981   for (ResourceManager *RI : Resources) {
982     ResourceManager *D = RI;
983     delete D;
984   }
985   Resources.clear();
986   return Resmii;
987 }
988 
989 /// Calculate the recurrence-constrainted minimum initiation interval.
990 /// Iterate over each circuit.  Compute the delay(c) and distance(c)
991 /// for each circuit. The II needs to satisfy the inequality
992 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
993 /// II that satisfies the inequality, and the RecMII is the maximum
994 /// of those values.
995 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
996   unsigned RecMII = 0;
997 
998   for (NodeSet &Nodes : NodeSets) {
999     if (Nodes.empty())
1000       continue;
1001 
1002     unsigned Delay = Nodes.getLatency();
1003     unsigned Distance = 1;
1004 
1005     // ii = ceil(delay / distance)
1006     unsigned CurMII = (Delay + Distance - 1) / Distance;
1007     Nodes.setRecMII(CurMII);
1008     if (CurMII > RecMII)
1009       RecMII = CurMII;
1010   }
1011 
1012   return RecMII;
1013 }
1014 
1015 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1016 /// but we do this to find the circuits, and then change them back.
1017 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1018   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1019   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1020     SUnit *SU = &SUnits[i];
1021     for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1022          IP != EP; ++IP) {
1023       if (IP->getKind() != SDep::Anti)
1024         continue;
1025       DepsAdded.push_back(std::make_pair(SU, *IP));
1026     }
1027   }
1028   for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1029                                                           E = DepsAdded.end();
1030        I != E; ++I) {
1031     // Remove this anti dependency and add one in the reverse direction.
1032     SUnit *SU = I->first;
1033     SDep &D = I->second;
1034     SUnit *TargetSU = D.getSUnit();
1035     unsigned Reg = D.getReg();
1036     unsigned Lat = D.getLatency();
1037     SU->removePred(D);
1038     SDep Dep(SU, SDep::Anti, Reg);
1039     Dep.setLatency(Lat);
1040     TargetSU->addPred(Dep);
1041   }
1042 }
1043 
1044 /// Create the adjacency structure of the nodes in the graph.
1045 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1046     SwingSchedulerDAG *DAG) {
1047   BitVector Added(SUnits.size());
1048   DenseMap<int, int> OutputDeps;
1049   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1050     Added.reset();
1051     // Add any successor to the adjacency matrix and exclude duplicates.
1052     for (auto &SI : SUnits[i].Succs) {
1053       // Only create a back-edge on the first and last nodes of a dependence
1054       // chain. This records any chains and adds them later.
1055       if (SI.getKind() == SDep::Output) {
1056         int N = SI.getSUnit()->NodeNum;
1057         int BackEdge = i;
1058         auto Dep = OutputDeps.find(BackEdge);
1059         if (Dep != OutputDeps.end()) {
1060           BackEdge = Dep->second;
1061           OutputDeps.erase(Dep);
1062         }
1063         OutputDeps[N] = BackEdge;
1064       }
1065       // Do not process a boundary node, an artificial node.
1066       // A back-edge is processed only if it goes to a Phi.
1067       if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1068           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1069         continue;
1070       int N = SI.getSUnit()->NodeNum;
1071       if (!Added.test(N)) {
1072         AdjK[i].push_back(N);
1073         Added.set(N);
1074       }
1075     }
1076     // A chain edge between a store and a load is treated as a back-edge in the
1077     // adjacency matrix.
1078     for (auto &PI : SUnits[i].Preds) {
1079       if (!SUnits[i].getInstr()->mayStore() ||
1080           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1081         continue;
1082       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1083         int N = PI.getSUnit()->NodeNum;
1084         if (!Added.test(N)) {
1085           AdjK[i].push_back(N);
1086           Added.set(N);
1087         }
1088       }
1089     }
1090   }
1091   // Add back-edges in the adjacency matrix for the output dependences.
1092   for (auto &OD : OutputDeps)
1093     if (!Added.test(OD.second)) {
1094       AdjK[OD.first].push_back(OD.second);
1095       Added.set(OD.second);
1096     }
1097 }
1098 
1099 /// Identify an elementary circuit in the dependence graph starting at the
1100 /// specified node.
1101 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1102                                           bool HasBackedge) {
1103   SUnit *SV = &SUnits[V];
1104   bool F = false;
1105   Stack.insert(SV);
1106   Blocked.set(V);
1107 
1108   for (auto W : AdjK[V]) {
1109     if (NumPaths > MaxPaths)
1110       break;
1111     if (W < S)
1112       continue;
1113     if (W == S) {
1114       if (!HasBackedge)
1115         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1116       F = true;
1117       ++NumPaths;
1118       break;
1119     } else if (!Blocked.test(W)) {
1120       if (circuit(W, S, NodeSets,
1121                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1122         F = true;
1123     }
1124   }
1125 
1126   if (F)
1127     unblock(V);
1128   else {
1129     for (auto W : AdjK[V]) {
1130       if (W < S)
1131         continue;
1132       if (B[W].count(SV) == 0)
1133         B[W].insert(SV);
1134     }
1135   }
1136   Stack.pop_back();
1137   return F;
1138 }
1139 
1140 /// Unblock a node in the circuit finding algorithm.
1141 void SwingSchedulerDAG::Circuits::unblock(int U) {
1142   Blocked.reset(U);
1143   SmallPtrSet<SUnit *, 4> &BU = B[U];
1144   while (!BU.empty()) {
1145     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1146     assert(SI != BU.end() && "Invalid B set.");
1147     SUnit *W = *SI;
1148     BU.erase(W);
1149     if (Blocked.test(W->NodeNum))
1150       unblock(W->NodeNum);
1151   }
1152 }
1153 
1154 /// Identify all the elementary circuits in the dependence graph using
1155 /// Johnson's circuit algorithm.
1156 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1157   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1158   // but we do this to find the circuits, and then change them back.
1159   swapAntiDependences(SUnits);
1160 
1161   Circuits Cir(SUnits, Topo);
1162   // Create the adjacency structure.
1163   Cir.createAdjacencyStructure(this);
1164   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1165     Cir.reset();
1166     Cir.circuit(i, i, NodeSets);
1167   }
1168 
1169   // Change the dependences back so that we've created a DAG again.
1170   swapAntiDependences(SUnits);
1171 }
1172 
1173 // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1174 // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1175 // additional copies that are needed across iterations. An artificial dependence
1176 // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1177 
1178 // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1179 // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1180 // PHI-------True-Dep------> USEOfPhi
1181 
1182 // The mutation creates
1183 // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1184 
1185 // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1186 // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1187 // late  to avoid additional copies across iterations. The possible scheduling
1188 // order would be
1189 // USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
1190 
1191 void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1192   for (SUnit &SU : DAG->SUnits) {
1193     // Find the COPY/REG_SEQUENCE instruction.
1194     if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1195       continue;
1196 
1197     // Record the loop carried PHIs.
1198     SmallVector<SUnit *, 4> PHISUs;
1199     // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1200     SmallVector<SUnit *, 4> SrcSUs;
1201 
1202     for (auto &Dep : SU.Preds) {
1203       SUnit *TmpSU = Dep.getSUnit();
1204       MachineInstr *TmpMI = TmpSU->getInstr();
1205       SDep::Kind DepKind = Dep.getKind();
1206       // Save the loop carried PHI.
1207       if (DepKind == SDep::Anti && TmpMI->isPHI())
1208         PHISUs.push_back(TmpSU);
1209       // Save the source of COPY/REG_SEQUENCE.
1210       // If the source has no pre-decessors, we will end up creating cycles.
1211       else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1212         SrcSUs.push_back(TmpSU);
1213     }
1214 
1215     if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1216       continue;
1217 
1218     // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1219     // SUnit to the container.
1220     SmallVector<SUnit *, 8> UseSUs;
1221     for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
1222       for (auto &Dep : (*I)->Succs) {
1223         if (Dep.getKind() != SDep::Data)
1224           continue;
1225 
1226         SUnit *TmpSU = Dep.getSUnit();
1227         MachineInstr *TmpMI = TmpSU->getInstr();
1228         if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1229           PHISUs.push_back(TmpSU);
1230           continue;
1231         }
1232         UseSUs.push_back(TmpSU);
1233       }
1234     }
1235 
1236     if (UseSUs.size() == 0)
1237       continue;
1238 
1239     SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1240     // Add the artificial dependencies if it does not form a cycle.
1241     for (auto I : UseSUs) {
1242       for (auto Src : SrcSUs) {
1243         if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1244           Src->addPred(SDep(I, SDep::Artificial));
1245           SDAG->Topo.AddPred(Src, I);
1246         }
1247       }
1248     }
1249   }
1250 }
1251 
1252 /// Return true for DAG nodes that we ignore when computing the cost functions.
1253 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1254 /// in the calculation of the ASAP, ALAP, etc functions.
1255 static bool ignoreDependence(const SDep &D, bool isPred) {
1256   if (D.isArtificial())
1257     return true;
1258   return D.getKind() == SDep::Anti && isPred;
1259 }
1260 
1261 /// Compute several functions need to order the nodes for scheduling.
1262 ///  ASAP - Earliest time to schedule a node.
1263 ///  ALAP - Latest time to schedule a node.
1264 ///  MOV - Mobility function, difference between ALAP and ASAP.
1265 ///  D - Depth of each node.
1266 ///  H - Height of each node.
1267 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1268   ScheduleInfo.resize(SUnits.size());
1269 
1270   LLVM_DEBUG({
1271     for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1272                                                     E = Topo.end();
1273          I != E; ++I) {
1274       const SUnit &SU = SUnits[*I];
1275       dumpNode(SU);
1276     }
1277   });
1278 
1279   int maxASAP = 0;
1280   // Compute ASAP and ZeroLatencyDepth.
1281   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1282                                                   E = Topo.end();
1283        I != E; ++I) {
1284     int asap = 0;
1285     int zeroLatencyDepth = 0;
1286     SUnit *SU = &SUnits[*I];
1287     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1288                                     EP = SU->Preds.end();
1289          IP != EP; ++IP) {
1290       SUnit *pred = IP->getSUnit();
1291       if (IP->getLatency() == 0)
1292         zeroLatencyDepth =
1293             std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1294       if (ignoreDependence(*IP, true))
1295         continue;
1296       asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
1297                                   getDistance(pred, SU, *IP) * MII));
1298     }
1299     maxASAP = std::max(maxASAP, asap);
1300     ScheduleInfo[*I].ASAP = asap;
1301     ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
1302   }
1303 
1304   // Compute ALAP, ZeroLatencyHeight, and MOV.
1305   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1306                                                           E = Topo.rend();
1307        I != E; ++I) {
1308     int alap = maxASAP;
1309     int zeroLatencyHeight = 0;
1310     SUnit *SU = &SUnits[*I];
1311     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1312                                     ES = SU->Succs.end();
1313          IS != ES; ++IS) {
1314       SUnit *succ = IS->getSUnit();
1315       if (IS->getLatency() == 0)
1316         zeroLatencyHeight =
1317             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1318       if (ignoreDependence(*IS, true))
1319         continue;
1320       alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
1321                                   getDistance(SU, succ, *IS) * MII));
1322     }
1323 
1324     ScheduleInfo[*I].ALAP = alap;
1325     ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
1326   }
1327 
1328   // After computing the node functions, compute the summary for each node set.
1329   for (NodeSet &I : NodeSets)
1330     I.computeNodeSetInfo(this);
1331 
1332   LLVM_DEBUG({
1333     for (unsigned i = 0; i < SUnits.size(); i++) {
1334       dbgs() << "\tNode " << i << ":\n";
1335       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1336       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1337       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1338       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1339       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1340       dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1341       dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1342     }
1343   });
1344 }
1345 
1346 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1347 /// as the predecessors of the elements of NodeOrder that are not also in
1348 /// NodeOrder.
1349 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1350                    SmallSetVector<SUnit *, 8> &Preds,
1351                    const NodeSet *S = nullptr) {
1352   Preds.clear();
1353   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1354        I != E; ++I) {
1355     for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1356          PI != PE; ++PI) {
1357       if (S && S->count(PI->getSUnit()) == 0)
1358         continue;
1359       if (ignoreDependence(*PI, true))
1360         continue;
1361       if (NodeOrder.count(PI->getSUnit()) == 0)
1362         Preds.insert(PI->getSUnit());
1363     }
1364     // Back-edges are predecessors with an anti-dependence.
1365     for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1366                                     ES = (*I)->Succs.end();
1367          IS != ES; ++IS) {
1368       if (IS->getKind() != SDep::Anti)
1369         continue;
1370       if (S && S->count(IS->getSUnit()) == 0)
1371         continue;
1372       if (NodeOrder.count(IS->getSUnit()) == 0)
1373         Preds.insert(IS->getSUnit());
1374     }
1375   }
1376   return !Preds.empty();
1377 }
1378 
1379 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1380 /// as the successors of the elements of NodeOrder that are not also in
1381 /// NodeOrder.
1382 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1383                    SmallSetVector<SUnit *, 8> &Succs,
1384                    const NodeSet *S = nullptr) {
1385   Succs.clear();
1386   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1387        I != E; ++I) {
1388     for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1389          SI != SE; ++SI) {
1390       if (S && S->count(SI->getSUnit()) == 0)
1391         continue;
1392       if (ignoreDependence(*SI, false))
1393         continue;
1394       if (NodeOrder.count(SI->getSUnit()) == 0)
1395         Succs.insert(SI->getSUnit());
1396     }
1397     for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1398                                     PE = (*I)->Preds.end();
1399          PI != PE; ++PI) {
1400       if (PI->getKind() != SDep::Anti)
1401         continue;
1402       if (S && S->count(PI->getSUnit()) == 0)
1403         continue;
1404       if (NodeOrder.count(PI->getSUnit()) == 0)
1405         Succs.insert(PI->getSUnit());
1406     }
1407   }
1408   return !Succs.empty();
1409 }
1410 
1411 /// Return true if there is a path from the specified node to any of the nodes
1412 /// in DestNodes. Keep track and return the nodes in any path.
1413 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1414                         SetVector<SUnit *> &DestNodes,
1415                         SetVector<SUnit *> &Exclude,
1416                         SmallPtrSet<SUnit *, 8> &Visited) {
1417   if (Cur->isBoundaryNode())
1418     return false;
1419   if (Exclude.count(Cur) != 0)
1420     return false;
1421   if (DestNodes.count(Cur) != 0)
1422     return true;
1423   if (!Visited.insert(Cur).second)
1424     return Path.count(Cur) != 0;
1425   bool FoundPath = false;
1426   for (auto &SI : Cur->Succs)
1427     FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1428   for (auto &PI : Cur->Preds)
1429     if (PI.getKind() == SDep::Anti)
1430       FoundPath |=
1431           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1432   if (FoundPath)
1433     Path.insert(Cur);
1434   return FoundPath;
1435 }
1436 
1437 /// Return true if Set1 is a subset of Set2.
1438 template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1439   for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1440     if (Set2.count(*I) == 0)
1441       return false;
1442   return true;
1443 }
1444 
1445 /// Compute the live-out registers for the instructions in a node-set.
1446 /// The live-out registers are those that are defined in the node-set,
1447 /// but not used. Except for use operands of Phis.
1448 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1449                             NodeSet &NS) {
1450   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1451   MachineRegisterInfo &MRI = MF.getRegInfo();
1452   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1453   SmallSet<unsigned, 4> Uses;
1454   for (SUnit *SU : NS) {
1455     const MachineInstr *MI = SU->getInstr();
1456     if (MI->isPHI())
1457       continue;
1458     for (const MachineOperand &MO : MI->operands())
1459       if (MO.isReg() && MO.isUse()) {
1460         unsigned Reg = MO.getReg();
1461         if (TargetRegisterInfo::isVirtualRegister(Reg))
1462           Uses.insert(Reg);
1463         else if (MRI.isAllocatable(Reg))
1464           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1465             Uses.insert(*Units);
1466       }
1467   }
1468   for (SUnit *SU : NS)
1469     for (const MachineOperand &MO : SU->getInstr()->operands())
1470       if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1471         unsigned Reg = MO.getReg();
1472         if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1473           if (!Uses.count(Reg))
1474             LiveOutRegs.push_back(RegisterMaskPair(Reg,
1475                                                    LaneBitmask::getNone()));
1476         } else if (MRI.isAllocatable(Reg)) {
1477           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1478             if (!Uses.count(*Units))
1479               LiveOutRegs.push_back(RegisterMaskPair(*Units,
1480                                                      LaneBitmask::getNone()));
1481         }
1482       }
1483   RPTracker.addLiveRegs(LiveOutRegs);
1484 }
1485 
1486 /// A heuristic to filter nodes in recurrent node-sets if the register
1487 /// pressure of a set is too high.
1488 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1489   for (auto &NS : NodeSets) {
1490     // Skip small node-sets since they won't cause register pressure problems.
1491     if (NS.size() <= 2)
1492       continue;
1493     IntervalPressure RecRegPressure;
1494     RegPressureTracker RecRPTracker(RecRegPressure);
1495     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1496     computeLiveOuts(MF, RecRPTracker, NS);
1497     RecRPTracker.closeBottom();
1498 
1499     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1500     llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1501       return A->NodeNum > B->NodeNum;
1502     });
1503 
1504     for (auto &SU : SUnits) {
1505       // Since we're computing the register pressure for a subset of the
1506       // instructions in a block, we need to set the tracker for each
1507       // instruction in the node-set. The tracker is set to the instruction
1508       // just after the one we're interested in.
1509       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1510       RecRPTracker.setPos(std::next(CurInstI));
1511 
1512       RegPressureDelta RPDelta;
1513       ArrayRef<PressureChange> CriticalPSets;
1514       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1515                                              CriticalPSets,
1516                                              RecRegPressure.MaxSetPressure);
1517       if (RPDelta.Excess.isValid()) {
1518         LLVM_DEBUG(
1519             dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1520                    << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1521                    << ":" << RPDelta.Excess.getUnitInc());
1522         NS.setExceedPressure(SU);
1523         break;
1524       }
1525       RecRPTracker.recede();
1526     }
1527   }
1528 }
1529 
1530 /// A heuristic to colocate node sets that have the same set of
1531 /// successors.
1532 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1533   unsigned Colocate = 0;
1534   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1535     NodeSet &N1 = NodeSets[i];
1536     SmallSetVector<SUnit *, 8> S1;
1537     if (N1.empty() || !succ_L(N1, S1))
1538       continue;
1539     for (int j = i + 1; j < e; ++j) {
1540       NodeSet &N2 = NodeSets[j];
1541       if (N1.compareRecMII(N2) != 0)
1542         continue;
1543       SmallSetVector<SUnit *, 8> S2;
1544       if (N2.empty() || !succ_L(N2, S2))
1545         continue;
1546       if (isSubset(S1, S2) && S1.size() == S2.size()) {
1547         N1.setColocate(++Colocate);
1548         N2.setColocate(Colocate);
1549         break;
1550       }
1551     }
1552   }
1553 }
1554 
1555 /// Check if the existing node-sets are profitable. If not, then ignore the
1556 /// recurrent node-sets, and attempt to schedule all nodes together. This is
1557 /// a heuristic. If the MII is large and all the recurrent node-sets are small,
1558 /// then it's best to try to schedule all instructions together instead of
1559 /// starting with the recurrent node-sets.
1560 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1561   // Look for loops with a large MII.
1562   if (MII < 17)
1563     return;
1564   // Check if the node-set contains only a simple add recurrence.
1565   for (auto &NS : NodeSets) {
1566     if (NS.getRecMII() > 2)
1567       return;
1568     if (NS.getMaxDepth() > MII)
1569       return;
1570   }
1571   NodeSets.clear();
1572   LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1573   return;
1574 }
1575 
1576 /// Add the nodes that do not belong to a recurrence set into groups
1577 /// based upon connected componenets.
1578 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1579   SetVector<SUnit *> NodesAdded;
1580   SmallPtrSet<SUnit *, 8> Visited;
1581   // Add the nodes that are on a path between the previous node sets and
1582   // the current node set.
1583   for (NodeSet &I : NodeSets) {
1584     SmallSetVector<SUnit *, 8> N;
1585     // Add the nodes from the current node set to the previous node set.
1586     if (succ_L(I, N)) {
1587       SetVector<SUnit *> Path;
1588       for (SUnit *NI : N) {
1589         Visited.clear();
1590         computePath(NI, Path, NodesAdded, I, Visited);
1591       }
1592       if (!Path.empty())
1593         I.insert(Path.begin(), Path.end());
1594     }
1595     // Add the nodes from the previous node set to the current node set.
1596     N.clear();
1597     if (succ_L(NodesAdded, N)) {
1598       SetVector<SUnit *> Path;
1599       for (SUnit *NI : N) {
1600         Visited.clear();
1601         computePath(NI, Path, I, NodesAdded, Visited);
1602       }
1603       if (!Path.empty())
1604         I.insert(Path.begin(), Path.end());
1605     }
1606     NodesAdded.insert(I.begin(), I.end());
1607   }
1608 
1609   // Create a new node set with the connected nodes of any successor of a node
1610   // in a recurrent set.
1611   NodeSet NewSet;
1612   SmallSetVector<SUnit *, 8> N;
1613   if (succ_L(NodesAdded, N))
1614     for (SUnit *I : N)
1615       addConnectedNodes(I, NewSet, NodesAdded);
1616   if (!NewSet.empty())
1617     NodeSets.push_back(NewSet);
1618 
1619   // Create a new node set with the connected nodes of any predecessor of a node
1620   // in a recurrent set.
1621   NewSet.clear();
1622   if (pred_L(NodesAdded, N))
1623     for (SUnit *I : N)
1624       addConnectedNodes(I, NewSet, NodesAdded);
1625   if (!NewSet.empty())
1626     NodeSets.push_back(NewSet);
1627 
1628   // Create new nodes sets with the connected nodes any remaining node that
1629   // has no predecessor.
1630   for (unsigned i = 0; i < SUnits.size(); ++i) {
1631     SUnit *SU = &SUnits[i];
1632     if (NodesAdded.count(SU) == 0) {
1633       NewSet.clear();
1634       addConnectedNodes(SU, NewSet, NodesAdded);
1635       if (!NewSet.empty())
1636         NodeSets.push_back(NewSet);
1637     }
1638   }
1639 }
1640 
1641 /// Add the node to the set, and add all of its connected nodes to the set.
1642 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1643                                           SetVector<SUnit *> &NodesAdded) {
1644   NewSet.insert(SU);
1645   NodesAdded.insert(SU);
1646   for (auto &SI : SU->Succs) {
1647     SUnit *Successor = SI.getSUnit();
1648     if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1649       addConnectedNodes(Successor, NewSet, NodesAdded);
1650   }
1651   for (auto &PI : SU->Preds) {
1652     SUnit *Predecessor = PI.getSUnit();
1653     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1654       addConnectedNodes(Predecessor, NewSet, NodesAdded);
1655   }
1656 }
1657 
1658 /// Return true if Set1 contains elements in Set2. The elements in common
1659 /// are returned in a different container.
1660 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1661                         SmallSetVector<SUnit *, 8> &Result) {
1662   Result.clear();
1663   for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1664     SUnit *SU = Set1[i];
1665     if (Set2.count(SU) != 0)
1666       Result.insert(SU);
1667   }
1668   return !Result.empty();
1669 }
1670 
1671 /// Merge the recurrence node sets that have the same initial node.
1672 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1673   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1674        ++I) {
1675     NodeSet &NI = *I;
1676     for (NodeSetType::iterator J = I + 1; J != E;) {
1677       NodeSet &NJ = *J;
1678       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1679         if (NJ.compareRecMII(NI) > 0)
1680           NI.setRecMII(NJ.getRecMII());
1681         for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1682              ++NII)
1683           I->insert(*NII);
1684         NodeSets.erase(J);
1685         E = NodeSets.end();
1686       } else {
1687         ++J;
1688       }
1689     }
1690   }
1691 }
1692 
1693 /// Remove nodes that have been scheduled in previous NodeSets.
1694 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1695   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1696        ++I)
1697     for (NodeSetType::iterator J = I + 1; J != E;) {
1698       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1699 
1700       if (J->empty()) {
1701         NodeSets.erase(J);
1702         E = NodeSets.end();
1703       } else {
1704         ++J;
1705       }
1706     }
1707 }
1708 
1709 /// Compute an ordered list of the dependence graph nodes, which
1710 /// indicates the order that the nodes will be scheduled.  This is a
1711 /// two-level algorithm. First, a partial order is created, which
1712 /// consists of a list of sets ordered from highest to lowest priority.
1713 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1714   SmallSetVector<SUnit *, 8> R;
1715   NodeOrder.clear();
1716 
1717   for (auto &Nodes : NodeSets) {
1718     LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1719     OrderKind Order;
1720     SmallSetVector<SUnit *, 8> N;
1721     if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
1722       R.insert(N.begin(), N.end());
1723       Order = BottomUp;
1724       LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
1725     } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
1726       R.insert(N.begin(), N.end());
1727       Order = TopDown;
1728       LLVM_DEBUG(dbgs() << "  Top down (succs) ");
1729     } else if (isIntersect(N, Nodes, R)) {
1730       // If some of the successors are in the existing node-set, then use the
1731       // top-down ordering.
1732       Order = TopDown;
1733       LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
1734     } else if (NodeSets.size() == 1) {
1735       for (auto &N : Nodes)
1736         if (N->Succs.size() == 0)
1737           R.insert(N);
1738       Order = BottomUp;
1739       LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
1740     } else {
1741       // Find the node with the highest ASAP.
1742       SUnit *maxASAP = nullptr;
1743       for (SUnit *SU : Nodes) {
1744         if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
1745             (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
1746           maxASAP = SU;
1747       }
1748       R.insert(maxASAP);
1749       Order = BottomUp;
1750       LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
1751     }
1752 
1753     while (!R.empty()) {
1754       if (Order == TopDown) {
1755         // Choose the node with the maximum height.  If more than one, choose
1756         // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
1757         // choose the node with the lowest MOV.
1758         while (!R.empty()) {
1759           SUnit *maxHeight = nullptr;
1760           for (SUnit *I : R) {
1761             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
1762               maxHeight = I;
1763             else if (getHeight(I) == getHeight(maxHeight) &&
1764                      getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
1765               maxHeight = I;
1766             else if (getHeight(I) == getHeight(maxHeight) &&
1767                      getZeroLatencyHeight(I) ==
1768                          getZeroLatencyHeight(maxHeight) &&
1769                      getMOV(I) < getMOV(maxHeight))
1770               maxHeight = I;
1771           }
1772           NodeOrder.insert(maxHeight);
1773           LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
1774           R.remove(maxHeight);
1775           for (const auto &I : maxHeight->Succs) {
1776             if (Nodes.count(I.getSUnit()) == 0)
1777               continue;
1778             if (NodeOrder.count(I.getSUnit()) != 0)
1779               continue;
1780             if (ignoreDependence(I, false))
1781               continue;
1782             R.insert(I.getSUnit());
1783           }
1784           // Back-edges are predecessors with an anti-dependence.
1785           for (const auto &I : maxHeight->Preds) {
1786             if (I.getKind() != SDep::Anti)
1787               continue;
1788             if (Nodes.count(I.getSUnit()) == 0)
1789               continue;
1790             if (NodeOrder.count(I.getSUnit()) != 0)
1791               continue;
1792             R.insert(I.getSUnit());
1793           }
1794         }
1795         Order = BottomUp;
1796         LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
1797         SmallSetVector<SUnit *, 8> N;
1798         if (pred_L(NodeOrder, N, &Nodes))
1799           R.insert(N.begin(), N.end());
1800       } else {
1801         // Choose the node with the maximum depth.  If more than one, choose
1802         // the node with the maximum ZeroLatencyDepth. If still more than one,
1803         // choose the node with the lowest MOV.
1804         while (!R.empty()) {
1805           SUnit *maxDepth = nullptr;
1806           for (SUnit *I : R) {
1807             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
1808               maxDepth = I;
1809             else if (getDepth(I) == getDepth(maxDepth) &&
1810                      getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
1811               maxDepth = I;
1812             else if (getDepth(I) == getDepth(maxDepth) &&
1813                      getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
1814                      getMOV(I) < getMOV(maxDepth))
1815               maxDepth = I;
1816           }
1817           NodeOrder.insert(maxDepth);
1818           LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
1819           R.remove(maxDepth);
1820           if (Nodes.isExceedSU(maxDepth)) {
1821             Order = TopDown;
1822             R.clear();
1823             R.insert(Nodes.getNode(0));
1824             break;
1825           }
1826           for (const auto &I : maxDepth->Preds) {
1827             if (Nodes.count(I.getSUnit()) == 0)
1828               continue;
1829             if (NodeOrder.count(I.getSUnit()) != 0)
1830               continue;
1831             R.insert(I.getSUnit());
1832           }
1833           // Back-edges are predecessors with an anti-dependence.
1834           for (const auto &I : maxDepth->Succs) {
1835             if (I.getKind() != SDep::Anti)
1836               continue;
1837             if (Nodes.count(I.getSUnit()) == 0)
1838               continue;
1839             if (NodeOrder.count(I.getSUnit()) != 0)
1840               continue;
1841             R.insert(I.getSUnit());
1842           }
1843         }
1844         Order = TopDown;
1845         LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
1846         SmallSetVector<SUnit *, 8> N;
1847         if (succ_L(NodeOrder, N, &Nodes))
1848           R.insert(N.begin(), N.end());
1849       }
1850     }
1851     LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
1852   }
1853 
1854   LLVM_DEBUG({
1855     dbgs() << "Node order: ";
1856     for (SUnit *I : NodeOrder)
1857       dbgs() << " " << I->NodeNum << " ";
1858     dbgs() << "\n";
1859   });
1860 }
1861 
1862 /// Process the nodes in the computed order and create the pipelined schedule
1863 /// of the instructions, if possible. Return true if a schedule is found.
1864 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
1865   if (NodeOrder.empty())
1866     return false;
1867 
1868   bool scheduleFound = false;
1869   unsigned II = 0;
1870   // Keep increasing II until a valid schedule is found.
1871   for (II = MII; II <= MAX_II && !scheduleFound; ++II) {
1872     Schedule.reset();
1873     Schedule.setInitiationInterval(II);
1874     LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
1875 
1876     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
1877     SetVector<SUnit *>::iterator NE = NodeOrder.end();
1878     do {
1879       SUnit *SU = *NI;
1880 
1881       // Compute the schedule time for the instruction, which is based
1882       // upon the scheduled time for any predecessors/successors.
1883       int EarlyStart = INT_MIN;
1884       int LateStart = INT_MAX;
1885       // These values are set when the size of the schedule window is limited
1886       // due to chain dependences.
1887       int SchedEnd = INT_MAX;
1888       int SchedStart = INT_MIN;
1889       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
1890                             II, this);
1891       LLVM_DEBUG({
1892         dbgs() << "Inst (" << SU->NodeNum << ") ";
1893         SU->getInstr()->dump();
1894         dbgs() << "\n";
1895       });
1896       LLVM_DEBUG({
1897         dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
1898                << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
1899       });
1900 
1901       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
1902           SchedStart > LateStart)
1903         scheduleFound = false;
1904       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
1905         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
1906         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
1907       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
1908         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
1909         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
1910       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
1911         SchedEnd =
1912             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
1913         // When scheduling a Phi it is better to start at the late cycle and go
1914         // backwards. The default order may insert the Phi too far away from
1915         // its first dependence.
1916         if (SU->getInstr()->isPHI())
1917           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
1918         else
1919           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
1920       } else {
1921         int FirstCycle = Schedule.getFirstCycle();
1922         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
1923                                         FirstCycle + getASAP(SU) + II - 1, II);
1924       }
1925       // Even if we find a schedule, make sure the schedule doesn't exceed the
1926       // allowable number of stages. We keep trying if this happens.
1927       if (scheduleFound)
1928         if (SwpMaxStages > -1 &&
1929             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
1930           scheduleFound = false;
1931 
1932       LLVM_DEBUG({
1933         if (!scheduleFound)
1934           dbgs() << "\tCan't schedule\n";
1935       });
1936     } while (++NI != NE && scheduleFound);
1937 
1938     // If a schedule is found, check if it is a valid schedule too.
1939     if (scheduleFound)
1940       scheduleFound = Schedule.isValidSchedule(this);
1941   }
1942 
1943   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
1944                     << ")\n");
1945 
1946   if (scheduleFound)
1947     Schedule.finalizeSchedule(this);
1948   else
1949     Schedule.reset();
1950 
1951   return scheduleFound && Schedule.getMaxStageCount() > 0;
1952 }
1953 
1954 /// Given a schedule for the loop, generate a new version of the loop,
1955 /// and replace the old version.  This function generates a prolog
1956 /// that contains the initial iterations in the pipeline, and kernel
1957 /// loop, and the epilogue that contains the code for the final
1958 /// iterations.
1959 void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
1960   // Create a new basic block for the kernel and add it to the CFG.
1961   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
1962 
1963   unsigned MaxStageCount = Schedule.getMaxStageCount();
1964 
1965   // Remember the registers that are used in different stages. The index is
1966   // the iteration, or stage, that the instruction is scheduled in.  This is
1967   // a map between register names in the original block and the names created
1968   // in each stage of the pipelined loop.
1969   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
1970   InstrMapTy InstrMap;
1971 
1972   SmallVector<MachineBasicBlock *, 4> PrologBBs;
1973   // Generate the prolog instructions that set up the pipeline.
1974   generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
1975   MF.insert(BB->getIterator(), KernelBB);
1976 
1977   // Rearrange the instructions to generate the new, pipelined loop,
1978   // and update register names as needed.
1979   for (int Cycle = Schedule.getFirstCycle(),
1980            LastCycle = Schedule.getFinalCycle();
1981        Cycle <= LastCycle; ++Cycle) {
1982     std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
1983     // This inner loop schedules each instruction in the cycle.
1984     for (SUnit *CI : CycleInstrs) {
1985       if (CI->getInstr()->isPHI())
1986         continue;
1987       unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
1988       MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
1989       updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
1990       KernelBB->push_back(NewMI);
1991       InstrMap[NewMI] = CI->getInstr();
1992     }
1993   }
1994 
1995   // Copy any terminator instructions to the new kernel, and update
1996   // names as needed.
1997   for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
1998                                    E = BB->instr_end();
1999        I != E; ++I) {
2000     MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2001     updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2002     KernelBB->push_back(NewMI);
2003     InstrMap[NewMI] = &*I;
2004   }
2005 
2006   KernelBB->transferSuccessors(BB);
2007   KernelBB->replaceSuccessor(BB, KernelBB);
2008 
2009   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2010                        VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2011   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2012                InstrMap, MaxStageCount, MaxStageCount, false);
2013 
2014   LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2015 
2016   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
2017   // Generate the epilog instructions to complete the pipeline.
2018   generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2019                  PrologBBs);
2020 
2021   // We need this step because the register allocation doesn't handle some
2022   // situations well, so we insert copies to help out.
2023   splitLifetimes(KernelBB, EpilogBBs, Schedule);
2024 
2025   // Remove dead instructions due to loop induction variables.
2026   removeDeadInstructions(KernelBB, EpilogBBs);
2027 
2028   // Add branches between prolog and epilog blocks.
2029   addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2030 
2031   // Remove the original loop since it's no longer referenced.
2032   for (auto &I : *BB)
2033     LIS.RemoveMachineInstrFromMaps(I);
2034   BB->clear();
2035   BB->eraseFromParent();
2036 
2037   delete[] VRMap;
2038 }
2039 
2040 /// Generate the pipeline prolog code.
2041 void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2042                                        MachineBasicBlock *KernelBB,
2043                                        ValueMapTy *VRMap,
2044                                        MBBVectorTy &PrologBBs) {
2045   MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2046   assert(PreheaderBB != nullptr &&
2047          "Need to add code to handle loops w/o preheader");
2048   MachineBasicBlock *PredBB = PreheaderBB;
2049   InstrMapTy InstrMap;
2050 
2051   // Generate a basic block for each stage, not including the last stage,
2052   // which will be generated in the kernel. Each basic block may contain
2053   // instructions from multiple stages/iterations.
2054   for (unsigned i = 0; i < LastStage; ++i) {
2055     // Create and insert the prolog basic block prior to the original loop
2056     // basic block.  The original loop is removed later.
2057     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2058     PrologBBs.push_back(NewBB);
2059     MF.insert(BB->getIterator(), NewBB);
2060     NewBB->transferSuccessors(PredBB);
2061     PredBB->addSuccessor(NewBB);
2062     PredBB = NewBB;
2063 
2064     // Generate instructions for each appropriate stage. Process instructions
2065     // in original program order.
2066     for (int StageNum = i; StageNum >= 0; --StageNum) {
2067       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2068                                        BBE = BB->getFirstTerminator();
2069            BBI != BBE; ++BBI) {
2070         if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
2071           if (BBI->isPHI())
2072             continue;
2073           MachineInstr *NewMI =
2074               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2075           updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2076                             VRMap);
2077           NewBB->push_back(NewMI);
2078           InstrMap[NewMI] = &*BBI;
2079         }
2080       }
2081     }
2082     rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2083     LLVM_DEBUG({
2084       dbgs() << "prolog:\n";
2085       NewBB->dump();
2086     });
2087   }
2088 
2089   PredBB->replaceSuccessor(BB, KernelBB);
2090 
2091   // Check if we need to remove the branch from the preheader to the original
2092   // loop, and replace it with a branch to the new loop.
2093   unsigned numBranches = TII->removeBranch(*PreheaderBB);
2094   if (numBranches) {
2095     SmallVector<MachineOperand, 0> Cond;
2096     TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2097   }
2098 }
2099 
2100 /// Generate the pipeline epilog code. The epilog code finishes the iterations
2101 /// that were started in either the prolog or the kernel.  We create a basic
2102 /// block for each stage that needs to complete.
2103 void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2104                                        MachineBasicBlock *KernelBB,
2105                                        ValueMapTy *VRMap,
2106                                        MBBVectorTy &EpilogBBs,
2107                                        MBBVectorTy &PrologBBs) {
2108   // We need to change the branch from the kernel to the first epilog block, so
2109   // this call to analyze branch uses the kernel rather than the original BB.
2110   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2111   SmallVector<MachineOperand, 4> Cond;
2112   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2113   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2114   if (checkBranch)
2115     return;
2116 
2117   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2118   if (*LoopExitI == KernelBB)
2119     ++LoopExitI;
2120   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2121   MachineBasicBlock *LoopExitBB = *LoopExitI;
2122 
2123   MachineBasicBlock *PredBB = KernelBB;
2124   MachineBasicBlock *EpilogStart = LoopExitBB;
2125   InstrMapTy InstrMap;
2126 
2127   // Generate a basic block for each stage, not including the last stage,
2128   // which was generated for the kernel.  Each basic block may contain
2129   // instructions from multiple stages/iterations.
2130   int EpilogStage = LastStage + 1;
2131   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2132     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2133     EpilogBBs.push_back(NewBB);
2134     MF.insert(BB->getIterator(), NewBB);
2135 
2136     PredBB->replaceSuccessor(LoopExitBB, NewBB);
2137     NewBB->addSuccessor(LoopExitBB);
2138 
2139     if (EpilogStart == LoopExitBB)
2140       EpilogStart = NewBB;
2141 
2142     // Add instructions to the epilog depending on the current block.
2143     // Process instructions in original program order.
2144     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2145       for (auto &BBI : *BB) {
2146         if (BBI.isPHI())
2147           continue;
2148         MachineInstr *In = &BBI;
2149         if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2150           // Instructions with memoperands in the epilog are updated with
2151           // conservative values.
2152           MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
2153           updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2154           NewBB->push_back(NewMI);
2155           InstrMap[NewMI] = In;
2156         }
2157       }
2158     }
2159     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2160                          VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2161     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2162                  InstrMap, LastStage, EpilogStage, i == 1);
2163     PredBB = NewBB;
2164 
2165     LLVM_DEBUG({
2166       dbgs() << "epilog:\n";
2167       NewBB->dump();
2168     });
2169   }
2170 
2171   // Fix any Phi nodes in the loop exit block.
2172   for (MachineInstr &MI : *LoopExitBB) {
2173     if (!MI.isPHI())
2174       break;
2175     for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2176       MachineOperand &MO = MI.getOperand(i);
2177       if (MO.getMBB() == BB)
2178         MO.setMBB(PredBB);
2179     }
2180   }
2181 
2182   // Create a branch to the new epilog from the kernel.
2183   // Remove the original branch and add a new branch to the epilog.
2184   TII->removeBranch(*KernelBB);
2185   TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2186   // Add a branch to the loop exit.
2187   if (EpilogBBs.size() > 0) {
2188     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2189     SmallVector<MachineOperand, 4> Cond1;
2190     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2191   }
2192 }
2193 
2194 /// Replace all uses of FromReg that appear outside the specified
2195 /// basic block with ToReg.
2196 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2197                                     MachineBasicBlock *MBB,
2198                                     MachineRegisterInfo &MRI,
2199                                     LiveIntervals &LIS) {
2200   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2201                                          E = MRI.use_end();
2202        I != E;) {
2203     MachineOperand &O = *I;
2204     ++I;
2205     if (O.getParent()->getParent() != MBB)
2206       O.setReg(ToReg);
2207   }
2208   if (!LIS.hasInterval(ToReg))
2209     LIS.createEmptyInterval(ToReg);
2210 }
2211 
2212 /// Return true if the register has a use that occurs outside the
2213 /// specified loop.
2214 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2215                             MachineRegisterInfo &MRI) {
2216   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2217                                          E = MRI.use_end();
2218        I != E; ++I)
2219     if (I->getParent()->getParent() != BB)
2220       return true;
2221   return false;
2222 }
2223 
2224 /// Generate Phis for the specific block in the generated pipelined code.
2225 /// This function looks at the Phis from the original code to guide the
2226 /// creation of new Phis.
2227 void SwingSchedulerDAG::generateExistingPhis(
2228     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2229     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2230     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2231     bool IsLast) {
2232   // Compute the stage number for the initial value of the Phi, which
2233   // comes from the prolog. The prolog to use depends on to which kernel/
2234   // epilog that we're adding the Phi.
2235   unsigned PrologStage = 0;
2236   unsigned PrevStage = 0;
2237   bool InKernel = (LastStageNum == CurStageNum);
2238   if (InKernel) {
2239     PrologStage = LastStageNum - 1;
2240     PrevStage = CurStageNum;
2241   } else {
2242     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2243     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2244   }
2245 
2246   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2247                                    BBE = BB->getFirstNonPHI();
2248        BBI != BBE; ++BBI) {
2249     unsigned Def = BBI->getOperand(0).getReg();
2250 
2251     unsigned InitVal = 0;
2252     unsigned LoopVal = 0;
2253     getPhiRegs(*BBI, BB, InitVal, LoopVal);
2254 
2255     unsigned PhiOp1 = 0;
2256     // The Phi value from the loop body typically is defined in the loop, but
2257     // not always. So, we need to check if the value is defined in the loop.
2258     unsigned PhiOp2 = LoopVal;
2259     if (VRMap[LastStageNum].count(LoopVal))
2260       PhiOp2 = VRMap[LastStageNum][LoopVal];
2261 
2262     int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2263     int LoopValStage =
2264         Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2265     unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2266     if (NumStages == 0) {
2267       // We don't need to generate a Phi anymore, but we need to rename any uses
2268       // of the Phi value.
2269       unsigned NewReg = VRMap[PrevStage][LoopVal];
2270       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2271                             Def, InitVal, NewReg);
2272       if (VRMap[CurStageNum].count(LoopVal))
2273         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2274     }
2275     // Adjust the number of Phis needed depending on the number of prologs left,
2276     // and the distance from where the Phi is first scheduled. The number of
2277     // Phis cannot exceed the number of prolog stages. Each stage can
2278     // potentially define two values.
2279     unsigned MaxPhis = PrologStage + 2;
2280     if (!InKernel && (int)PrologStage <= LoopValStage)
2281       MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
2282     unsigned NumPhis = std::min(NumStages, MaxPhis);
2283 
2284     unsigned NewReg = 0;
2285     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2286     // In the epilog, we may need to look back one stage to get the correct
2287     // Phi name because the epilog and prolog blocks execute the same stage.
2288     // The correct name is from the previous block only when the Phi has
2289     // been completely scheduled prior to the epilog, and Phi value is not
2290     // needed in multiple stages.
2291     int StageDiff = 0;
2292     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2293         NumPhis == 1)
2294       StageDiff = 1;
2295     // Adjust the computations below when the phi and the loop definition
2296     // are scheduled in different stages.
2297     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2298       StageDiff = StageScheduled - LoopValStage;
2299     for (unsigned np = 0; np < NumPhis; ++np) {
2300       // If the Phi hasn't been scheduled, then use the initial Phi operand
2301       // value. Otherwise, use the scheduled version of the instruction. This
2302       // is a little complicated when a Phi references another Phi.
2303       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2304         PhiOp1 = InitVal;
2305       // Check if the Phi has already been scheduled in a prolog stage.
2306       else if (PrologStage >= AccessStage + StageDiff + np &&
2307                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2308         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2309       // Check if the Phi has already been scheduled, but the loop instruction
2310       // is either another Phi, or doesn't occur in the loop.
2311       else if (PrologStage >= AccessStage + StageDiff + np) {
2312         // If the Phi references another Phi, we need to examine the other
2313         // Phi to get the correct value.
2314         PhiOp1 = LoopVal;
2315         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2316         int Indirects = 1;
2317         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2318           int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2319           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2320             PhiOp1 = getInitPhiReg(*InstOp1, BB);
2321           else
2322             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2323           InstOp1 = MRI.getVRegDef(PhiOp1);
2324           int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2325           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2326           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2327               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2328             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2329             break;
2330           }
2331           ++Indirects;
2332         }
2333       } else
2334         PhiOp1 = InitVal;
2335       // If this references a generated Phi in the kernel, get the Phi operand
2336       // from the incoming block.
2337       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2338         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2339           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2340 
2341       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2342       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2343       // In the epilog, a map lookup is needed to get the value from the kernel,
2344       // or previous epilog block. How is does this depends on if the
2345       // instruction is scheduled in the previous block.
2346       if (!InKernel) {
2347         int StageDiffAdj = 0;
2348         if (LoopValStage != -1 && StageScheduled > LoopValStage)
2349           StageDiffAdj = StageScheduled - LoopValStage;
2350         // Use the loop value defined in the kernel, unless the kernel
2351         // contains the last definition of the Phi.
2352         if (np == 0 && PrevStage == LastStageNum &&
2353             (StageScheduled != 0 || LoopValStage != 0) &&
2354             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2355           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2356         // Use the value defined by the Phi. We add one because we switch
2357         // from looking at the loop value to the Phi definition.
2358         else if (np > 0 && PrevStage == LastStageNum &&
2359                  VRMap[PrevStage - np + 1].count(Def))
2360           PhiOp2 = VRMap[PrevStage - np + 1][Def];
2361         // Use the loop value defined in the kernel.
2362         else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
2363                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2364           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2365         // Use the value defined by the Phi, unless we're generating the first
2366         // epilog and the Phi refers to a Phi in a different stage.
2367         else if (VRMap[PrevStage - np].count(Def) &&
2368                  (!LoopDefIsPhi || PrevStage != LastStageNum))
2369           PhiOp2 = VRMap[PrevStage - np][Def];
2370       }
2371 
2372       // Check if we can reuse an existing Phi. This occurs when a Phi
2373       // references another Phi, and the other Phi is scheduled in an
2374       // earlier stage. We can try to reuse an existing Phi up until the last
2375       // stage of the current Phi.
2376       if (LoopDefIsPhi) {
2377         if (static_cast<int>(PrologStage - np) >= StageScheduled) {
2378           int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2379           int StageDiff = (StageScheduled - LoopValStage);
2380           LVNumStages -= StageDiff;
2381           // Make sure the loop value Phi has been processed already.
2382           if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
2383             NewReg = PhiOp2;
2384             unsigned ReuseStage = CurStageNum;
2385             if (Schedule.isLoopCarried(this, *PhiInst))
2386               ReuseStage -= LVNumStages;
2387             // Check if the Phi to reuse has been generated yet. If not, then
2388             // there is nothing to reuse.
2389             if (VRMap[ReuseStage - np].count(LoopVal)) {
2390               NewReg = VRMap[ReuseStage - np][LoopVal];
2391 
2392               rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2393                                     &*BBI, Def, NewReg);
2394               // Update the map with the new Phi name.
2395               VRMap[CurStageNum - np][Def] = NewReg;
2396               PhiOp2 = NewReg;
2397               if (VRMap[LastStageNum - np - 1].count(LoopVal))
2398                 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2399 
2400               if (IsLast && np == NumPhis - 1)
2401                 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2402               continue;
2403             }
2404           }
2405         }
2406         if (InKernel && StageDiff > 0 &&
2407             VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2408           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2409       }
2410 
2411       const TargetRegisterClass *RC = MRI.getRegClass(Def);
2412       NewReg = MRI.createVirtualRegister(RC);
2413 
2414       MachineInstrBuilder NewPhi =
2415           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2416                   TII->get(TargetOpcode::PHI), NewReg);
2417       NewPhi.addReg(PhiOp1).addMBB(BB1);
2418       NewPhi.addReg(PhiOp2).addMBB(BB2);
2419       if (np == 0)
2420         InstrMap[NewPhi] = &*BBI;
2421 
2422       // We define the Phis after creating the new pipelined code, so
2423       // we need to rename the Phi values in scheduled instructions.
2424 
2425       unsigned PrevReg = 0;
2426       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2427         PrevReg = VRMap[PrevStage - np][LoopVal];
2428       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2429                             Def, NewReg, PrevReg);
2430       // If the Phi has been scheduled, use the new name for rewriting.
2431       if (VRMap[CurStageNum - np].count(Def)) {
2432         unsigned R = VRMap[CurStageNum - np][Def];
2433         rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2434                               R, NewReg);
2435       }
2436 
2437       // Check if we need to rename any uses that occurs after the loop. The
2438       // register to replace depends on whether the Phi is scheduled in the
2439       // epilog.
2440       if (IsLast && np == NumPhis - 1)
2441         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2442 
2443       // In the kernel, a dependent Phi uses the value from this Phi.
2444       if (InKernel)
2445         PhiOp2 = NewReg;
2446 
2447       // Update the map with the new Phi name.
2448       VRMap[CurStageNum - np][Def] = NewReg;
2449     }
2450 
2451     while (NumPhis++ < NumStages) {
2452       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2453                             &*BBI, Def, NewReg, 0);
2454     }
2455 
2456     // Check if we need to rename a Phi that has been eliminated due to
2457     // scheduling.
2458     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2459       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2460   }
2461 }
2462 
2463 /// Generate Phis for the specified block in the generated pipelined code.
2464 /// These are new Phis needed because the definition is scheduled after the
2465 /// use in the pipelined sequence.
2466 void SwingSchedulerDAG::generatePhis(
2467     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2468     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2469     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2470     bool IsLast) {
2471   // Compute the stage number that contains the initial Phi value, and
2472   // the Phi from the previous stage.
2473   unsigned PrologStage = 0;
2474   unsigned PrevStage = 0;
2475   unsigned StageDiff = CurStageNum - LastStageNum;
2476   bool InKernel = (StageDiff == 0);
2477   if (InKernel) {
2478     PrologStage = LastStageNum - 1;
2479     PrevStage = CurStageNum;
2480   } else {
2481     PrologStage = LastStageNum - StageDiff;
2482     PrevStage = LastStageNum + StageDiff - 1;
2483   }
2484 
2485   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2486                                    BBE = BB->instr_end();
2487        BBI != BBE; ++BBI) {
2488     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2489       MachineOperand &MO = BBI->getOperand(i);
2490       if (!MO.isReg() || !MO.isDef() ||
2491           !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2492         continue;
2493 
2494       int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2495       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2496       unsigned Def = MO.getReg();
2497       unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2498       // An instruction scheduled in stage 0 and is used after the loop
2499       // requires a phi in the epilog for the last definition from either
2500       // the kernel or prolog.
2501       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2502           hasUseAfterLoop(Def, BB, MRI))
2503         NumPhis = 1;
2504       if (!InKernel && (unsigned)StageScheduled > PrologStage)
2505         continue;
2506 
2507       unsigned PhiOp2 = VRMap[PrevStage][Def];
2508       if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2509         if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2510           PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2511       // The number of Phis can't exceed the number of prolog stages. The
2512       // prolog stage number is zero based.
2513       if (NumPhis > PrologStage + 1 - StageScheduled)
2514         NumPhis = PrologStage + 1 - StageScheduled;
2515       for (unsigned np = 0; np < NumPhis; ++np) {
2516         unsigned PhiOp1 = VRMap[PrologStage][Def];
2517         if (np <= PrologStage)
2518           PhiOp1 = VRMap[PrologStage - np][Def];
2519         if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2520           if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2521             PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2522           if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2523             PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2524         }
2525         if (!InKernel)
2526           PhiOp2 = VRMap[PrevStage - np][Def];
2527 
2528         const TargetRegisterClass *RC = MRI.getRegClass(Def);
2529         unsigned NewReg = MRI.createVirtualRegister(RC);
2530 
2531         MachineInstrBuilder NewPhi =
2532             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2533                     TII->get(TargetOpcode::PHI), NewReg);
2534         NewPhi.addReg(PhiOp1).addMBB(BB1);
2535         NewPhi.addReg(PhiOp2).addMBB(BB2);
2536         if (np == 0)
2537           InstrMap[NewPhi] = &*BBI;
2538 
2539         // Rewrite uses and update the map. The actions depend upon whether
2540         // we generating code for the kernel or epilog blocks.
2541         if (InKernel) {
2542           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2543                                 &*BBI, PhiOp1, NewReg);
2544           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2545                                 &*BBI, PhiOp2, NewReg);
2546 
2547           PhiOp2 = NewReg;
2548           VRMap[PrevStage - np - 1][Def] = NewReg;
2549         } else {
2550           VRMap[CurStageNum - np][Def] = NewReg;
2551           if (np == NumPhis - 1)
2552             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2553                                   &*BBI, Def, NewReg);
2554         }
2555         if (IsLast && np == NumPhis - 1)
2556           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2557       }
2558     }
2559   }
2560 }
2561 
2562 /// Remove instructions that generate values with no uses.
2563 /// Typically, these are induction variable operations that generate values
2564 /// used in the loop itself.  A dead instruction has a definition with
2565 /// no uses, or uses that occur in the original loop only.
2566 void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2567                                                MBBVectorTy &EpilogBBs) {
2568   // For each epilog block, check that the value defined by each instruction
2569   // is used.  If not, delete it.
2570   for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2571                                      MBE = EpilogBBs.rend();
2572        MBB != MBE; ++MBB)
2573     for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2574                                                    ME = (*MBB)->instr_rend();
2575          MI != ME;) {
2576       // From DeadMachineInstructionElem. Don't delete inline assembly.
2577       if (MI->isInlineAsm()) {
2578         ++MI;
2579         continue;
2580       }
2581       bool SawStore = false;
2582       // Check if it's safe to remove the instruction due to side effects.
2583       // We can, and want to, remove Phis here.
2584       if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
2585         ++MI;
2586         continue;
2587       }
2588       bool used = true;
2589       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2590                                       MOE = MI->operands_end();
2591            MOI != MOE; ++MOI) {
2592         if (!MOI->isReg() || !MOI->isDef())
2593           continue;
2594         unsigned reg = MOI->getReg();
2595         // Assume physical registers are used, unless they are marked dead.
2596         if (TargetRegisterInfo::isPhysicalRegister(reg)) {
2597           used = !MOI->isDead();
2598           if (used)
2599             break;
2600           continue;
2601         }
2602         unsigned realUses = 0;
2603         for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2604                                                EI = MRI.use_end();
2605              UI != EI; ++UI) {
2606           // Check if there are any uses that occur only in the original
2607           // loop.  If so, that's not a real use.
2608           if (UI->getParent()->getParent() != BB) {
2609             realUses++;
2610             used = true;
2611             break;
2612           }
2613         }
2614         if (realUses > 0)
2615           break;
2616         used = false;
2617       }
2618       if (!used) {
2619         LIS.RemoveMachineInstrFromMaps(*MI);
2620         MI++->eraseFromParent();
2621         continue;
2622       }
2623       ++MI;
2624     }
2625   // In the kernel block, check if we can remove a Phi that generates a value
2626   // used in an instruction removed in the epilog block.
2627   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2628                                    BBE = KernelBB->getFirstNonPHI();
2629        BBI != BBE;) {
2630     MachineInstr *MI = &*BBI;
2631     ++BBI;
2632     unsigned reg = MI->getOperand(0).getReg();
2633     if (MRI.use_begin(reg) == MRI.use_end()) {
2634       LIS.RemoveMachineInstrFromMaps(*MI);
2635       MI->eraseFromParent();
2636     }
2637   }
2638 }
2639 
2640 /// For loop carried definitions, we split the lifetime of a virtual register
2641 /// that has uses past the definition in the next iteration. A copy with a new
2642 /// virtual register is inserted before the definition, which helps with
2643 /// generating a better register assignment.
2644 ///
2645 ///   v1 = phi(a, v2)     v1 = phi(a, v2)
2646 ///   v2 = phi(b, v3)     v2 = phi(b, v3)
2647 ///   v3 = ..             v4 = copy v1
2648 ///   .. = V1             v3 = ..
2649 ///                       .. = v4
2650 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
2651                                        MBBVectorTy &EpilogBBs,
2652                                        SMSchedule &Schedule) {
2653   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2654   for (auto &PHI : KernelBB->phis()) {
2655     unsigned Def = PHI.getOperand(0).getReg();
2656     // Check for any Phi definition that used as an operand of another Phi
2657     // in the same block.
2658     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
2659                                                  E = MRI.use_instr_end();
2660          I != E; ++I) {
2661       if (I->isPHI() && I->getParent() == KernelBB) {
2662         // Get the loop carried definition.
2663         unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
2664         if (!LCDef)
2665           continue;
2666         MachineInstr *MI = MRI.getVRegDef(LCDef);
2667         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
2668           continue;
2669         // Search through the rest of the block looking for uses of the Phi
2670         // definition. If one occurs, then split the lifetime.
2671         unsigned SplitReg = 0;
2672         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
2673                                     KernelBB->instr_end()))
2674           if (BBJ.readsRegister(Def)) {
2675             // We split the lifetime when we find the first use.
2676             if (SplitReg == 0) {
2677               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
2678               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
2679                       TII->get(TargetOpcode::COPY), SplitReg)
2680                   .addReg(Def);
2681             }
2682             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
2683           }
2684         if (!SplitReg)
2685           continue;
2686         // Search through each of the epilog blocks for any uses to be renamed.
2687         for (auto &Epilog : EpilogBBs)
2688           for (auto &I : *Epilog)
2689             if (I.readsRegister(Def))
2690               I.substituteRegister(Def, SplitReg, 0, *TRI);
2691         break;
2692       }
2693     }
2694   }
2695 }
2696 
2697 /// Remove the incoming block from the Phis in a basic block.
2698 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
2699   for (MachineInstr &MI : *BB) {
2700     if (!MI.isPHI())
2701       break;
2702     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
2703       if (MI.getOperand(i + 1).getMBB() == Incoming) {
2704         MI.RemoveOperand(i + 1);
2705         MI.RemoveOperand(i);
2706         break;
2707       }
2708   }
2709 }
2710 
2711 /// Create branches from each prolog basic block to the appropriate epilog
2712 /// block.  These edges are needed if the loop ends before reaching the
2713 /// kernel.
2714 void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2715                                     MachineBasicBlock *KernelBB,
2716                                     MBBVectorTy &EpilogBBs,
2717                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
2718   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
2719   MachineInstr *IndVar = Pass.LI.LoopInductionVar;
2720   MachineInstr *Cmp = Pass.LI.LoopCompare;
2721   MachineBasicBlock *LastPro = KernelBB;
2722   MachineBasicBlock *LastEpi = KernelBB;
2723 
2724   // Start from the blocks connected to the kernel and work "out"
2725   // to the first prolog and the last epilog blocks.
2726   SmallVector<MachineInstr *, 4> PrevInsts;
2727   unsigned MaxIter = PrologBBs.size() - 1;
2728   unsigned LC = UINT_MAX;
2729   unsigned LCMin = UINT_MAX;
2730   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
2731     // Add branches to the prolog that go to the corresponding
2732     // epilog, and the fall-thru prolog/kernel block.
2733     MachineBasicBlock *Prolog = PrologBBs[j];
2734     MachineBasicBlock *Epilog = EpilogBBs[i];
2735     // We've executed one iteration, so decrement the loop count and check for
2736     // the loop end.
2737     SmallVector<MachineOperand, 4> Cond;
2738     // Check if the LOOP0 has already been removed. If so, then there is no need
2739     // to reduce the trip count.
2740     if (LC != 0)
2741       LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
2742                                 MaxIter);
2743 
2744     // Record the value of the first trip count, which is used to determine if
2745     // branches and blocks can be removed for constant trip counts.
2746     if (LCMin == UINT_MAX)
2747       LCMin = LC;
2748 
2749     unsigned numAdded = 0;
2750     if (TargetRegisterInfo::isVirtualRegister(LC)) {
2751       Prolog->addSuccessor(Epilog);
2752       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
2753     } else if (j >= LCMin) {
2754       Prolog->addSuccessor(Epilog);
2755       Prolog->removeSuccessor(LastPro);
2756       LastEpi->removeSuccessor(Epilog);
2757       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
2758       removePhis(Epilog, LastEpi);
2759       // Remove the blocks that are no longer referenced.
2760       if (LastPro != LastEpi) {
2761         LastEpi->clear();
2762         LastEpi->eraseFromParent();
2763       }
2764       LastPro->clear();
2765       LastPro->eraseFromParent();
2766     } else {
2767       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
2768       removePhis(Epilog, Prolog);
2769     }
2770     LastPro = Prolog;
2771     LastEpi = Epilog;
2772     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
2773                                                    E = Prolog->instr_rend();
2774          I != E && numAdded > 0; ++I, --numAdded)
2775       updateInstruction(&*I, false, j, 0, Schedule, VRMap);
2776   }
2777 }
2778 
2779 /// Return true if we can compute the amount the instruction changes
2780 /// during each iteration. Set Delta to the amount of the change.
2781 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2782   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2783   const MachineOperand *BaseOp;
2784   int64_t Offset;
2785   if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
2786     return false;
2787 
2788   if (!BaseOp->isReg())
2789     return false;
2790 
2791   unsigned BaseReg = BaseOp->getReg();
2792 
2793   MachineRegisterInfo &MRI = MF.getRegInfo();
2794   // Check if there is a Phi. If so, get the definition in the loop.
2795   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2796   if (BaseDef && BaseDef->isPHI()) {
2797     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2798     BaseDef = MRI.getVRegDef(BaseReg);
2799   }
2800   if (!BaseDef)
2801     return false;
2802 
2803   int D = 0;
2804   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2805     return false;
2806 
2807   Delta = D;
2808   return true;
2809 }
2810 
2811 /// Update the memory operand with a new offset when the pipeliner
2812 /// generates a new copy of the instruction that refers to a
2813 /// different memory location.
2814 void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
2815                                           MachineInstr &OldMI, unsigned Num) {
2816   if (Num == 0)
2817     return;
2818   // If the instruction has memory operands, then adjust the offset
2819   // when the instruction appears in different stages.
2820   if (NewMI.memoperands_empty())
2821     return;
2822   SmallVector<MachineMemOperand *, 2> NewMMOs;
2823   for (MachineMemOperand *MMO : NewMI.memoperands()) {
2824     // TODO: Figure out whether isAtomic is really necessary (see D57601).
2825     if (MMO->isVolatile() || MMO->isAtomic() ||
2826         (MMO->isInvariant() && MMO->isDereferenceable()) ||
2827         (!MMO->getValue())) {
2828       NewMMOs.push_back(MMO);
2829       continue;
2830     }
2831     unsigned Delta;
2832     if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
2833       int64_t AdjOffset = Delta * Num;
2834       NewMMOs.push_back(
2835           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
2836     } else {
2837       NewMMOs.push_back(
2838           MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
2839     }
2840   }
2841   NewMI.setMemRefs(MF, NewMMOs);
2842 }
2843 
2844 /// Clone the instruction for the new pipelined loop and update the
2845 /// memory operands, if needed.
2846 MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
2847                                             unsigned CurStageNum,
2848                                             unsigned InstStageNum) {
2849   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2850   // Check for tied operands in inline asm instructions. This should be handled
2851   // elsewhere, but I'm not sure of the best solution.
2852   if (OldMI->isInlineAsm())
2853     for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
2854       const auto &MO = OldMI->getOperand(i);
2855       if (MO.isReg() && MO.isUse())
2856         break;
2857       unsigned UseIdx;
2858       if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
2859         NewMI->tieOperands(i, UseIdx);
2860     }
2861   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
2862   return NewMI;
2863 }
2864 
2865 /// Clone the instruction for the new pipelined loop. If needed, this
2866 /// function updates the instruction using the values saved in the
2867 /// InstrChanges structure.
2868 MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
2869                                                      unsigned CurStageNum,
2870                                                      unsigned InstStageNum,
2871                                                      SMSchedule &Schedule) {
2872   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2873   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2874       InstrChanges.find(getSUnit(OldMI));
2875   if (It != InstrChanges.end()) {
2876     std::pair<unsigned, int64_t> RegAndOffset = It->second;
2877     unsigned BasePos, OffsetPos;
2878     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
2879       return nullptr;
2880     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
2881     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
2882     if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
2883       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
2884     NewMI->getOperand(OffsetPos).setImm(NewOffset);
2885   }
2886   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
2887   return NewMI;
2888 }
2889 
2890 /// Update the machine instruction with new virtual registers.  This
2891 /// function may change the defintions and/or uses.
2892 void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
2893                                           unsigned CurStageNum,
2894                                           unsigned InstrStageNum,
2895                                           SMSchedule &Schedule,
2896                                           ValueMapTy *VRMap) {
2897   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
2898     MachineOperand &MO = NewMI->getOperand(i);
2899     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2900       continue;
2901     unsigned reg = MO.getReg();
2902     if (MO.isDef()) {
2903       // Create a new virtual register for the definition.
2904       const TargetRegisterClass *RC = MRI.getRegClass(reg);
2905       unsigned NewReg = MRI.createVirtualRegister(RC);
2906       MO.setReg(NewReg);
2907       VRMap[CurStageNum][reg] = NewReg;
2908       if (LastDef)
2909         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
2910     } else if (MO.isUse()) {
2911       MachineInstr *Def = MRI.getVRegDef(reg);
2912       // Compute the stage that contains the last definition for instruction.
2913       int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
2914       unsigned StageNum = CurStageNum;
2915       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
2916         // Compute the difference in stages between the defintion and the use.
2917         unsigned StageDiff = (InstrStageNum - DefStageNum);
2918         // Make an adjustment to get the last definition.
2919         StageNum -= StageDiff;
2920       }
2921       if (VRMap[StageNum].count(reg))
2922         MO.setReg(VRMap[StageNum][reg]);
2923     }
2924   }
2925 }
2926 
2927 /// Return the instruction in the loop that defines the register.
2928 /// If the definition is a Phi, then follow the Phi operand to
2929 /// the instruction in the loop.
2930 MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
2931   SmallPtrSet<MachineInstr *, 8> Visited;
2932   MachineInstr *Def = MRI.getVRegDef(Reg);
2933   while (Def->isPHI()) {
2934     if (!Visited.insert(Def).second)
2935       break;
2936     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2937       if (Def->getOperand(i + 1).getMBB() == BB) {
2938         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2939         break;
2940       }
2941   }
2942   return Def;
2943 }
2944 
2945 /// Return the new name for the value from the previous stage.
2946 unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
2947                                           unsigned LoopVal, unsigned LoopStage,
2948                                           ValueMapTy *VRMap,
2949                                           MachineBasicBlock *BB) {
2950   unsigned PrevVal = 0;
2951   if (StageNum > PhiStage) {
2952     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
2953     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
2954       // The name is defined in the previous stage.
2955       PrevVal = VRMap[StageNum - 1][LoopVal];
2956     else if (VRMap[StageNum].count(LoopVal))
2957       // The previous name is defined in the current stage when the instruction
2958       // order is swapped.
2959       PrevVal = VRMap[StageNum][LoopVal];
2960     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
2961       // The loop value hasn't yet been scheduled.
2962       PrevVal = LoopVal;
2963     else if (StageNum == PhiStage + 1)
2964       // The loop value is another phi, which has not been scheduled.
2965       PrevVal = getInitPhiReg(*LoopInst, BB);
2966     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
2967       // The loop value is another phi, which has been scheduled.
2968       PrevVal =
2969           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
2970                         LoopStage, VRMap, BB);
2971   }
2972   return PrevVal;
2973 }
2974 
2975 /// Rewrite the Phi values in the specified block to use the mappings
2976 /// from the initial operand. Once the Phi is scheduled, we switch
2977 /// to using the loop value instead of the Phi value, so those names
2978 /// do not need to be rewritten.
2979 void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
2980                                          unsigned StageNum,
2981                                          SMSchedule &Schedule,
2982                                          ValueMapTy *VRMap,
2983                                          InstrMapTy &InstrMap) {
2984   for (auto &PHI : BB->phis()) {
2985     unsigned InitVal = 0;
2986     unsigned LoopVal = 0;
2987     getPhiRegs(PHI, BB, InitVal, LoopVal);
2988     unsigned PhiDef = PHI.getOperand(0).getReg();
2989 
2990     unsigned PhiStage =
2991         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
2992     unsigned LoopStage =
2993         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2994     unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
2995     if (NumPhis > StageNum)
2996       NumPhis = StageNum;
2997     for (unsigned np = 0; np <= NumPhis; ++np) {
2998       unsigned NewVal =
2999           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3000       if (!NewVal)
3001         NewVal = InitVal;
3002       rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
3003                             PhiDef, NewVal);
3004     }
3005   }
3006 }
3007 
3008 /// Rewrite a previously scheduled instruction to use the register value
3009 /// from the new instruction. Make sure the instruction occurs in the
3010 /// basic block, and we don't change the uses in the new instruction.
3011 void SwingSchedulerDAG::rewriteScheduledInstr(
3012     MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3013     unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3014     unsigned NewReg, unsigned PrevReg) {
3015   bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3016   int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3017   // Rewrite uses that have been scheduled already to use the new
3018   // Phi register.
3019   for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3020                                          EI = MRI.use_end();
3021        UI != EI;) {
3022     MachineOperand &UseOp = *UI;
3023     MachineInstr *UseMI = UseOp.getParent();
3024     ++UI;
3025     if (UseMI->getParent() != BB)
3026       continue;
3027     if (UseMI->isPHI()) {
3028       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3029         continue;
3030       if (getLoopPhiReg(*UseMI, BB) != OldReg)
3031         continue;
3032     }
3033     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3034     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3035     SUnit *OrigMISU = getSUnit(OrigInstr->second);
3036     int StageSched = Schedule.stageScheduled(OrigMISU);
3037     int CycleSched = Schedule.cycleScheduled(OrigMISU);
3038     unsigned ReplaceReg = 0;
3039     // This is the stage for the scheduled instruction.
3040     if (StagePhi == StageSched && Phi->isPHI()) {
3041       int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3042       if (PrevReg && InProlog)
3043         ReplaceReg = PrevReg;
3044       else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3045                (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3046         ReplaceReg = PrevReg;
3047       else
3048         ReplaceReg = NewReg;
3049     }
3050     // The scheduled instruction occurs before the scheduled Phi, and the
3051     // Phi is not loop carried.
3052     if (!InProlog && StagePhi + 1 == StageSched &&
3053         !Schedule.isLoopCarried(this, *Phi))
3054       ReplaceReg = NewReg;
3055     if (StagePhi > StageSched && Phi->isPHI())
3056       ReplaceReg = NewReg;
3057     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3058       ReplaceReg = NewReg;
3059     if (ReplaceReg) {
3060       MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3061       UseOp.setReg(ReplaceReg);
3062     }
3063   }
3064 }
3065 
3066 /// Check if we can change the instruction to use an offset value from the
3067 /// previous iteration. If so, return true and set the base and offset values
3068 /// so that we can rewrite the load, if necessary.
3069 ///   v1 = Phi(v0, v3)
3070 ///   v2 = load v1, 0
3071 ///   v3 = post_store v1, 4, x
3072 /// This function enables the load to be rewritten as v2 = load v3, 4.
3073 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3074                                               unsigned &BasePos,
3075                                               unsigned &OffsetPos,
3076                                               unsigned &NewBase,
3077                                               int64_t &Offset) {
3078   // Get the load instruction.
3079   if (TII->isPostIncrement(*MI))
3080     return false;
3081   unsigned BasePosLd, OffsetPosLd;
3082   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3083     return false;
3084   unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3085 
3086   // Look for the Phi instruction.
3087   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3088   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3089   if (!Phi || !Phi->isPHI())
3090     return false;
3091   // Get the register defined in the loop block.
3092   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3093   if (!PrevReg)
3094     return false;
3095 
3096   // Check for the post-increment load/store instruction.
3097   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3098   if (!PrevDef || PrevDef == MI)
3099     return false;
3100 
3101   if (!TII->isPostIncrement(*PrevDef))
3102     return false;
3103 
3104   unsigned BasePos1 = 0, OffsetPos1 = 0;
3105   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3106     return false;
3107 
3108   // Make sure that the instructions do not access the same memory location in
3109   // the next iteration.
3110   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3111   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3112   MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3113   NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3114   bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3115   MF.DeleteMachineInstr(NewMI);
3116   if (!Disjoint)
3117     return false;
3118 
3119   // Set the return value once we determine that we return true.
3120   BasePos = BasePosLd;
3121   OffsetPos = OffsetPosLd;
3122   NewBase = PrevReg;
3123   Offset = StoreOffset;
3124   return true;
3125 }
3126 
3127 /// Apply changes to the instruction if needed. The changes are need
3128 /// to improve the scheduling and depend up on the final schedule.
3129 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3130                                          SMSchedule &Schedule) {
3131   SUnit *SU = getSUnit(MI);
3132   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3133       InstrChanges.find(SU);
3134   if (It != InstrChanges.end()) {
3135     std::pair<unsigned, int64_t> RegAndOffset = It->second;
3136     unsigned BasePos, OffsetPos;
3137     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3138       return;
3139     unsigned BaseReg = MI->getOperand(BasePos).getReg();
3140     MachineInstr *LoopDef = findDefInLoop(BaseReg);
3141     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3142     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3143     int BaseStageNum = Schedule.stageScheduled(SU);
3144     int BaseCycleNum = Schedule.cycleScheduled(SU);
3145     if (BaseStageNum < DefStageNum) {
3146       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3147       int OffsetDiff = DefStageNum - BaseStageNum;
3148       if (DefCycleNum < BaseCycleNum) {
3149         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3150         if (OffsetDiff > 0)
3151           --OffsetDiff;
3152       }
3153       int64_t NewOffset =
3154           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3155       NewMI->getOperand(OffsetPos).setImm(NewOffset);
3156       SU->setInstr(NewMI);
3157       MISUnitMap[NewMI] = SU;
3158       NewMIs.insert(NewMI);
3159     }
3160   }
3161 }
3162 
3163 /// Return true for an order or output dependence that is loop carried
3164 /// potentially. A dependence is loop carried if the destination defines a valu
3165 /// that may be used or defined by the source in a subsequent iteration.
3166 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
3167                                          bool isSucc) {
3168   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
3169       Dep.isArtificial())
3170     return false;
3171 
3172   if (!SwpPruneLoopCarried)
3173     return true;
3174 
3175   if (Dep.getKind() == SDep::Output)
3176     return true;
3177 
3178   MachineInstr *SI = Source->getInstr();
3179   MachineInstr *DI = Dep.getSUnit()->getInstr();
3180   if (!isSucc)
3181     std::swap(SI, DI);
3182   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3183 
3184   // Assume ordered loads and stores may have a loop carried dependence.
3185   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3186       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3187     return true;
3188 
3189   // Only chain dependences between a load and store can be loop carried.
3190   if (!DI->mayStore() || !SI->mayLoad())
3191     return false;
3192 
3193   unsigned DeltaS, DeltaD;
3194   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
3195     return true;
3196 
3197   const MachineOperand *BaseOpS, *BaseOpD;
3198   int64_t OffsetS, OffsetD;
3199   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3200   if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||
3201       !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI))
3202     return true;
3203 
3204   if (!BaseOpS->isIdenticalTo(*BaseOpD))
3205     return true;
3206 
3207   // Check that the base register is incremented by a constant value for each
3208   // iteration.
3209   MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
3210   if (!Def || !Def->isPHI())
3211     return true;
3212   unsigned InitVal = 0;
3213   unsigned LoopVal = 0;
3214   getPhiRegs(*Def, BB, InitVal, LoopVal);
3215   MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
3216   int D = 0;
3217   if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
3218     return true;
3219 
3220   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3221   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3222 
3223   // This is the main test, which checks the offset values and the loop
3224   // increment value to determine if the accesses may be loop carried.
3225   if (AccessSizeS == MemoryLocation::UnknownSize ||
3226       AccessSizeD == MemoryLocation::UnknownSize)
3227     return true;
3228 
3229   if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
3230     return true;
3231 
3232   return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
3233 }
3234 
3235 void SwingSchedulerDAG::postprocessDAG() {
3236   for (auto &M : Mutations)
3237     M->apply(this);
3238 }
3239 
3240 /// Try to schedule the node at the specified StartCycle and continue
3241 /// until the node is schedule or the EndCycle is reached.  This function
3242 /// returns true if the node is scheduled.  This routine may search either
3243 /// forward or backward for a place to insert the instruction based upon
3244 /// the relative values of StartCycle and EndCycle.
3245 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3246   bool forward = true;
3247   if (StartCycle > EndCycle)
3248     forward = false;
3249 
3250   // The terminating condition depends on the direction.
3251   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3252   for (int curCycle = StartCycle; curCycle != termCycle;
3253        forward ? ++curCycle : --curCycle) {
3254 
3255     // Add the already scheduled instructions at the specified cycle to the
3256     // DFA.
3257     ProcItinResources.clearResources();
3258     for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3259          checkCycle <= LastCycle; checkCycle += II) {
3260       std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3261 
3262       for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3263                                          E = cycleInstrs.end();
3264            I != E; ++I) {
3265         if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3266           continue;
3267         assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) &&
3268                "These instructions have already been scheduled.");
3269         ProcItinResources.reserveResources(*(*I)->getInstr());
3270       }
3271     }
3272     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3273         ProcItinResources.canReserveResources(*SU->getInstr())) {
3274       LLVM_DEBUG({
3275         dbgs() << "\tinsert at cycle " << curCycle << " ";
3276         SU->getInstr()->dump();
3277       });
3278 
3279       ScheduledInstrs[curCycle].push_back(SU);
3280       InstrToCycle.insert(std::make_pair(SU, curCycle));
3281       if (curCycle > LastCycle)
3282         LastCycle = curCycle;
3283       if (curCycle < FirstCycle)
3284         FirstCycle = curCycle;
3285       return true;
3286     }
3287     LLVM_DEBUG({
3288       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3289       SU->getInstr()->dump();
3290     });
3291   }
3292   return false;
3293 }
3294 
3295 // Return the cycle of the earliest scheduled instruction in the chain.
3296 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3297   SmallPtrSet<SUnit *, 8> Visited;
3298   SmallVector<SDep, 8> Worklist;
3299   Worklist.push_back(Dep);
3300   int EarlyCycle = INT_MAX;
3301   while (!Worklist.empty()) {
3302     const SDep &Cur = Worklist.pop_back_val();
3303     SUnit *PrevSU = Cur.getSUnit();
3304     if (Visited.count(PrevSU))
3305       continue;
3306     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3307     if (it == InstrToCycle.end())
3308       continue;
3309     EarlyCycle = std::min(EarlyCycle, it->second);
3310     for (const auto &PI : PrevSU->Preds)
3311       if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3312         Worklist.push_back(PI);
3313     Visited.insert(PrevSU);
3314   }
3315   return EarlyCycle;
3316 }
3317 
3318 // Return the cycle of the latest scheduled instruction in the chain.
3319 int SMSchedule::latestCycleInChain(const SDep &Dep) {
3320   SmallPtrSet<SUnit *, 8> Visited;
3321   SmallVector<SDep, 8> Worklist;
3322   Worklist.push_back(Dep);
3323   int LateCycle = INT_MIN;
3324   while (!Worklist.empty()) {
3325     const SDep &Cur = Worklist.pop_back_val();
3326     SUnit *SuccSU = Cur.getSUnit();
3327     if (Visited.count(SuccSU))
3328       continue;
3329     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3330     if (it == InstrToCycle.end())
3331       continue;
3332     LateCycle = std::max(LateCycle, it->second);
3333     for (const auto &SI : SuccSU->Succs)
3334       if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
3335         Worklist.push_back(SI);
3336     Visited.insert(SuccSU);
3337   }
3338   return LateCycle;
3339 }
3340 
3341 /// If an instruction has a use that spans multiple iterations, then
3342 /// return true. These instructions are characterized by having a back-ege
3343 /// to a Phi, which contains a reference to another Phi.
3344 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3345   for (auto &P : SU->Preds)
3346     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
3347       for (auto &S : P.getSUnit()->Succs)
3348         if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3349           return P.getSUnit();
3350   return nullptr;
3351 }
3352 
3353 /// Compute the scheduling start slot for the instruction.  The start slot
3354 /// depends on any predecessor or successor nodes scheduled already.
3355 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3356                               int *MinEnd, int *MaxStart, int II,
3357                               SwingSchedulerDAG *DAG) {
3358   // Iterate over each instruction that has been scheduled already.  The start
3359   // slot computation depends on whether the previously scheduled instruction
3360   // is a predecessor or successor of the specified instruction.
3361   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3362 
3363     // Iterate over each instruction in the current cycle.
3364     for (SUnit *I : getInstructions(cycle)) {
3365       // Because we're processing a DAG for the dependences, we recognize
3366       // the back-edge in recurrences by anti dependences.
3367       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
3368         const SDep &Dep = SU->Preds[i];
3369         if (Dep.getSUnit() == I) {
3370           if (!DAG->isBackedge(SU, Dep)) {
3371             int EarlyStart = cycle + Dep.getLatency() -
3372                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3373             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3374             if (DAG->isLoopCarriedDep(SU, Dep, false)) {
3375               int End = earliestCycleInChain(Dep) + (II - 1);
3376               *MinEnd = std::min(*MinEnd, End);
3377             }
3378           } else {
3379             int LateStart = cycle - Dep.getLatency() +
3380                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3381             *MinLateStart = std::min(*MinLateStart, LateStart);
3382           }
3383         }
3384         // For instruction that requires multiple iterations, make sure that
3385         // the dependent instruction is not scheduled past the definition.
3386         SUnit *BE = multipleIterations(I, DAG);
3387         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3388             !SU->isPred(I))
3389           *MinLateStart = std::min(*MinLateStart, cycle);
3390       }
3391       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
3392         if (SU->Succs[i].getSUnit() == I) {
3393           const SDep &Dep = SU->Succs[i];
3394           if (!DAG->isBackedge(SU, Dep)) {
3395             int LateStart = cycle - Dep.getLatency() +
3396                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3397             *MinLateStart = std::min(*MinLateStart, LateStart);
3398             if (DAG->isLoopCarriedDep(SU, Dep)) {
3399               int Start = latestCycleInChain(Dep) + 1 - II;
3400               *MaxStart = std::max(*MaxStart, Start);
3401             }
3402           } else {
3403             int EarlyStart = cycle + Dep.getLatency() -
3404                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3405             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3406           }
3407         }
3408       }
3409     }
3410   }
3411 }
3412 
3413 /// Order the instructions within a cycle so that the definitions occur
3414 /// before the uses. Returns true if the instruction is added to the start
3415 /// of the list, or false if added to the end.
3416 void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3417                                  std::deque<SUnit *> &Insts) {
3418   MachineInstr *MI = SU->getInstr();
3419   bool OrderBeforeUse = false;
3420   bool OrderAfterDef = false;
3421   bool OrderBeforeDef = false;
3422   unsigned MoveDef = 0;
3423   unsigned MoveUse = 0;
3424   int StageInst1 = stageScheduled(SU);
3425 
3426   unsigned Pos = 0;
3427   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3428        ++I, ++Pos) {
3429     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3430       MachineOperand &MO = MI->getOperand(i);
3431       if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3432         continue;
3433 
3434       unsigned Reg = MO.getReg();
3435       unsigned BasePos, OffsetPos;
3436       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3437         if (MI->getOperand(BasePos).getReg() == Reg)
3438           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3439             Reg = NewReg;
3440       bool Reads, Writes;
3441       std::tie(Reads, Writes) =
3442           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3443       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3444         OrderBeforeUse = true;
3445         if (MoveUse == 0)
3446           MoveUse = Pos;
3447       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3448         // Add the instruction after the scheduled instruction.
3449         OrderAfterDef = true;
3450         MoveDef = Pos;
3451       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3452         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3453           OrderBeforeUse = true;
3454           if (MoveUse == 0)
3455             MoveUse = Pos;
3456         } else {
3457           OrderAfterDef = true;
3458           MoveDef = Pos;
3459         }
3460       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3461         OrderBeforeUse = true;
3462         if (MoveUse == 0)
3463           MoveUse = Pos;
3464         if (MoveUse != 0) {
3465           OrderAfterDef = true;
3466           MoveDef = Pos - 1;
3467         }
3468       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3469         // Add the instruction before the scheduled instruction.
3470         OrderBeforeUse = true;
3471         if (MoveUse == 0)
3472           MoveUse = Pos;
3473       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3474                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3475         if (MoveUse == 0) {
3476           OrderBeforeDef = true;
3477           MoveUse = Pos;
3478         }
3479       }
3480     }
3481     // Check for order dependences between instructions. Make sure the source
3482     // is ordered before the destination.
3483     for (auto &S : SU->Succs) {
3484       if (S.getSUnit() != *I)
3485         continue;
3486       if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3487         OrderBeforeUse = true;
3488         if (Pos < MoveUse)
3489           MoveUse = Pos;
3490       }
3491     }
3492     for (auto &P : SU->Preds) {
3493       if (P.getSUnit() != *I)
3494         continue;
3495       if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3496         OrderAfterDef = true;
3497         MoveDef = Pos;
3498       }
3499     }
3500   }
3501 
3502   // A circular dependence.
3503   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3504     OrderBeforeUse = false;
3505 
3506   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3507   // to a loop-carried dependence.
3508   if (OrderBeforeDef)
3509     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3510 
3511   // The uncommon case when the instruction order needs to be updated because
3512   // there is both a use and def.
3513   if (OrderBeforeUse && OrderAfterDef) {
3514     SUnit *UseSU = Insts.at(MoveUse);
3515     SUnit *DefSU = Insts.at(MoveDef);
3516     if (MoveUse > MoveDef) {
3517       Insts.erase(Insts.begin() + MoveUse);
3518       Insts.erase(Insts.begin() + MoveDef);
3519     } else {
3520       Insts.erase(Insts.begin() + MoveDef);
3521       Insts.erase(Insts.begin() + MoveUse);
3522     }
3523     orderDependence(SSD, UseSU, Insts);
3524     orderDependence(SSD, SU, Insts);
3525     orderDependence(SSD, DefSU, Insts);
3526     return;
3527   }
3528   // Put the new instruction first if there is a use in the list. Otherwise,
3529   // put it at the end of the list.
3530   if (OrderBeforeUse)
3531     Insts.push_front(SU);
3532   else
3533     Insts.push_back(SU);
3534 }
3535 
3536 /// Return true if the scheduled Phi has a loop carried operand.
3537 bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3538   if (!Phi.isPHI())
3539     return false;
3540   assert(Phi.isPHI() && "Expecting a Phi.");
3541   SUnit *DefSU = SSD->getSUnit(&Phi);
3542   unsigned DefCycle = cycleScheduled(DefSU);
3543   int DefStage = stageScheduled(DefSU);
3544 
3545   unsigned InitVal = 0;
3546   unsigned LoopVal = 0;
3547   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3548   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3549   if (!UseSU)
3550     return true;
3551   if (UseSU->getInstr()->isPHI())
3552     return true;
3553   unsigned LoopCycle = cycleScheduled(UseSU);
3554   int LoopStage = stageScheduled(UseSU);
3555   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3556 }
3557 
3558 /// Return true if the instruction is a definition that is loop carried
3559 /// and defines the use on the next iteration.
3560 ///        v1 = phi(v2, v3)
3561 ///  (Def) v3 = op v1
3562 ///  (MO)   = v1
3563 /// If MO appears before Def, then then v1 and v3 may get assigned to the same
3564 /// register.
3565 bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
3566                                        MachineInstr *Def, MachineOperand &MO) {
3567   if (!MO.isReg())
3568     return false;
3569   if (Def->isPHI())
3570     return false;
3571   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3572   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3573     return false;
3574   if (!isLoopCarried(SSD, *Phi))
3575     return false;
3576   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3577   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
3578     MachineOperand &DMO = Def->getOperand(i);
3579     if (!DMO.isReg() || !DMO.isDef())
3580       continue;
3581     if (DMO.getReg() == LoopReg)
3582       return true;
3583   }
3584   return false;
3585 }
3586 
3587 // Check if the generated schedule is valid. This function checks if
3588 // an instruction that uses a physical register is scheduled in a
3589 // different stage than the definition. The pipeliner does not handle
3590 // physical register values that may cross a basic block boundary.
3591 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3592   for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
3593     SUnit &SU = SSD->SUnits[i];
3594     if (!SU.hasPhysRegDefs)
3595       continue;
3596     int StageDef = stageScheduled(&SU);
3597     assert(StageDef != -1 && "Instruction should have been scheduled.");
3598     for (auto &SI : SU.Succs)
3599       if (SI.isAssignedRegDep())
3600         if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
3601           if (stageScheduled(SI.getSUnit()) != StageDef)
3602             return false;
3603   }
3604   return true;
3605 }
3606 
3607 /// A property of the node order in swing-modulo-scheduling is
3608 /// that for nodes outside circuits the following holds:
3609 /// none of them is scheduled after both a successor and a
3610 /// predecessor.
3611 /// The method below checks whether the property is met.
3612 /// If not, debug information is printed and statistics information updated.
3613 /// Note that we do not use an assert statement.
3614 /// The reason is that although an invalid node oder may prevent
3615 /// the pipeliner from finding a pipelined schedule for arbitrary II,
3616 /// it does not lead to the generation of incorrect code.
3617 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3618 
3619   // a sorted vector that maps each SUnit to its index in the NodeOrder
3620   typedef std::pair<SUnit *, unsigned> UnitIndex;
3621   std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3622 
3623   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3624     Indices.push_back(std::make_pair(NodeOrder[i], i));
3625 
3626   auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3627     return std::get<0>(i1) < std::get<0>(i2);
3628   };
3629 
3630   // sort, so that we can perform a binary search
3631   llvm::sort(Indices, CompareKey);
3632 
3633   bool Valid = true;
3634   (void)Valid;
3635   // for each SUnit in the NodeOrder, check whether
3636   // it appears after both a successor and a predecessor
3637   // of the SUnit. If this is the case, and the SUnit
3638   // is not part of circuit, then the NodeOrder is not
3639   // valid.
3640   for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3641     SUnit *SU = NodeOrder[i];
3642     unsigned Index = i;
3643 
3644     bool PredBefore = false;
3645     bool SuccBefore = false;
3646 
3647     SUnit *Succ;
3648     SUnit *Pred;
3649     (void)Succ;
3650     (void)Pred;
3651 
3652     for (SDep &PredEdge : SU->Preds) {
3653       SUnit *PredSU = PredEdge.getSUnit();
3654       unsigned PredIndex =
3655           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
3656                                         std::make_pair(PredSU, 0), CompareKey));
3657       if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3658         PredBefore = true;
3659         Pred = PredSU;
3660         break;
3661       }
3662     }
3663 
3664     for (SDep &SuccEdge : SU->Succs) {
3665       SUnit *SuccSU = SuccEdge.getSUnit();
3666       unsigned SuccIndex =
3667           std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
3668                                         std::make_pair(SuccSU, 0), CompareKey));
3669       if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3670         SuccBefore = true;
3671         Succ = SuccSU;
3672         break;
3673       }
3674     }
3675 
3676     if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3677       // instructions in circuits are allowed to be scheduled
3678       // after both a successor and predecessor.
3679       bool InCircuit = std::any_of(
3680           Circuits.begin(), Circuits.end(),
3681           [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3682       if (InCircuit)
3683         LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3684       else {
3685         Valid = false;
3686         NumNodeOrderIssues++;
3687         LLVM_DEBUG(dbgs() << "Predecessor ";);
3688       }
3689       LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3690                         << " are scheduled before node " << SU->NodeNum
3691                         << "\n";);
3692     }
3693   }
3694 
3695   LLVM_DEBUG({
3696     if (!Valid)
3697       dbgs() << "Invalid node order found!\n";
3698   });
3699 }
3700 
3701 /// Attempt to fix the degenerate cases when the instruction serialization
3702 /// causes the register lifetimes to overlap. For example,
3703 ///   p' = store_pi(p, b)
3704 ///      = load p, offset
3705 /// In this case p and p' overlap, which means that two registers are needed.
3706 /// Instead, this function changes the load to use p' and updates the offset.
3707 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3708   unsigned OverlapReg = 0;
3709   unsigned NewBaseReg = 0;
3710   for (SUnit *SU : Instrs) {
3711     MachineInstr *MI = SU->getInstr();
3712     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3713       const MachineOperand &MO = MI->getOperand(i);
3714       // Look for an instruction that uses p. The instruction occurs in the
3715       // same cycle but occurs later in the serialized order.
3716       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3717         // Check that the instruction appears in the InstrChanges structure,
3718         // which contains instructions that can have the offset updated.
3719         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3720           InstrChanges.find(SU);
3721         if (It != InstrChanges.end()) {
3722           unsigned BasePos, OffsetPos;
3723           // Update the base register and adjust the offset.
3724           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3725             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3726             NewMI->getOperand(BasePos).setReg(NewBaseReg);
3727             int64_t NewOffset =
3728                 MI->getOperand(OffsetPos).getImm() - It->second.second;
3729             NewMI->getOperand(OffsetPos).setImm(NewOffset);
3730             SU->setInstr(NewMI);
3731             MISUnitMap[NewMI] = SU;
3732             NewMIs.insert(NewMI);
3733           }
3734         }
3735         OverlapReg = 0;
3736         NewBaseReg = 0;
3737         break;
3738       }
3739       // Look for an instruction of the form p' = op(p), which uses and defines
3740       // two virtual registers that get allocated to the same physical register.
3741       unsigned TiedUseIdx = 0;
3742       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3743         // OverlapReg is p in the example above.
3744         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3745         // NewBaseReg is p' in the example above.
3746         NewBaseReg = MI->getOperand(i).getReg();
3747         break;
3748       }
3749     }
3750   }
3751 }
3752 
3753 /// After the schedule has been formed, call this function to combine
3754 /// the instructions from the different stages/cycles.  That is, this
3755 /// function creates a schedule that represents a single iteration.
3756 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3757   // Move all instructions to the first stage from later stages.
3758   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3759     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3760          ++stage) {
3761       std::deque<SUnit *> &cycleInstrs =
3762           ScheduledInstrs[cycle + (stage * InitiationInterval)];
3763       for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
3764                                                  E = cycleInstrs.rend();
3765            I != E; ++I)
3766         ScheduledInstrs[cycle].push_front(*I);
3767     }
3768   }
3769   // Iterate over the definitions in each instruction, and compute the
3770   // stage difference for each use.  Keep the maximum value.
3771   for (auto &I : InstrToCycle) {
3772     int DefStage = stageScheduled(I.first);
3773     MachineInstr *MI = I.first->getInstr();
3774     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3775       MachineOperand &Op = MI->getOperand(i);
3776       if (!Op.isReg() || !Op.isDef())
3777         continue;
3778 
3779       unsigned Reg = Op.getReg();
3780       unsigned MaxDiff = 0;
3781       bool PhiIsSwapped = false;
3782       for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
3783                                              EI = MRI.use_end();
3784            UI != EI; ++UI) {
3785         MachineOperand &UseOp = *UI;
3786         MachineInstr *UseMI = UseOp.getParent();
3787         SUnit *SUnitUse = SSD->getSUnit(UseMI);
3788         int UseStage = stageScheduled(SUnitUse);
3789         unsigned Diff = 0;
3790         if (UseStage != -1 && UseStage >= DefStage)
3791           Diff = UseStage - DefStage;
3792         if (MI->isPHI()) {
3793           if (isLoopCarried(SSD, *MI))
3794             ++Diff;
3795           else
3796             PhiIsSwapped = true;
3797         }
3798         MaxDiff = std::max(Diff, MaxDiff);
3799       }
3800       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
3801     }
3802   }
3803 
3804   // Erase all the elements in the later stages. Only one iteration should
3805   // remain in the scheduled list, and it contains all the instructions.
3806   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3807     ScheduledInstrs.erase(cycle);
3808 
3809   // Change the registers in instruction as specified in the InstrChanges
3810   // map. We need to use the new registers to create the correct order.
3811   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
3812     SUnit *SU = &SSD->SUnits[i];
3813     SSD->applyInstrChange(SU->getInstr(), *this);
3814   }
3815 
3816   // Reorder the instructions in each cycle to fix and improve the
3817   // generated code.
3818   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3819     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3820     std::deque<SUnit *> newOrderPhi;
3821     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3822       SUnit *SU = cycleInstrs[i];
3823       if (SU->getInstr()->isPHI())
3824         newOrderPhi.push_back(SU);
3825     }
3826     std::deque<SUnit *> newOrderI;
3827     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3828       SUnit *SU = cycleInstrs[i];
3829       if (!SU->getInstr()->isPHI())
3830         orderDependence(SSD, SU, newOrderI);
3831     }
3832     // Replace the old order with the new order.
3833     cycleInstrs.swap(newOrderPhi);
3834     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
3835     SSD->fixupRegisterOverlaps(cycleInstrs);
3836   }
3837 
3838   LLVM_DEBUG(dump(););
3839 }
3840 
3841 void NodeSet::print(raw_ostream &os) const {
3842   os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3843      << " depth " << MaxDepth << " col " << Colocate << "\n";
3844   for (const auto &I : Nodes)
3845     os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
3846   os << "\n";
3847 }
3848 
3849 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3850 /// Print the schedule information to the given output.
3851 void SMSchedule::print(raw_ostream &os) const {
3852   // Iterate over each cycle.
3853   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3854     // Iterate over each instruction in the cycle.
3855     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3856     for (SUnit *CI : cycleInstrs->second) {
3857       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3858       os << "(" << CI->NodeNum << ") ";
3859       CI->getInstr()->print(os);
3860       os << "\n";
3861     }
3862   }
3863 }
3864 
3865 /// Utility function used for debugging to print the schedule.
3866 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3867 LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3868 
3869 #endif
3870 
3871 void ResourceManager::initProcResourceVectors(
3872     const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3873   unsigned ProcResourceID = 0;
3874 
3875   // We currently limit the resource kinds to 64 and below so that we can use
3876   // uint64_t for Masks
3877   assert(SM.getNumProcResourceKinds() < 64 &&
3878          "Too many kinds of resources, unsupported");
3879   // Create a unique bitmask for every processor resource unit.
3880   // Skip resource at index 0, since it always references 'InvalidUnit'.
3881   Masks.resize(SM.getNumProcResourceKinds());
3882   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3883     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3884     if (Desc.SubUnitsIdxBegin)
3885       continue;
3886     Masks[I] = 1ULL << ProcResourceID;
3887     ProcResourceID++;
3888   }
3889   // Create a unique bitmask for every processor resource group.
3890   for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3891     const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3892     if (!Desc.SubUnitsIdxBegin)
3893       continue;
3894     Masks[I] = 1ULL << ProcResourceID;
3895     for (unsigned U = 0; U < Desc.NumUnits; ++U)
3896       Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3897     ProcResourceID++;
3898   }
3899   LLVM_DEBUG({
3900     dbgs() << "ProcResourceDesc:\n";
3901     for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3902       const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3903       dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3904                        ProcResource->Name, I, Masks[I], ProcResource->NumUnits);
3905     }
3906     dbgs() << " -----------------\n";
3907   });
3908 }
3909 
3910 bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const {
3911 
3912   LLVM_DEBUG({ dbgs() << "canReserveResources:\n"; });
3913   if (UseDFA)
3914     return DFAResources->canReserveResources(MID);
3915 
3916   unsigned InsnClass = MID->getSchedClass();
3917   const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
3918   if (!SCDesc->isValid()) {
3919     LLVM_DEBUG({
3920       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3921       dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
3922     });
3923     return true;
3924   }
3925 
3926   const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc);
3927   const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc);
3928   for (; I != E; ++I) {
3929     if (!I->Cycles)
3930       continue;
3931     const MCProcResourceDesc *ProcResource =
3932         SM.getProcResource(I->ProcResourceIdx);
3933     unsigned NumUnits = ProcResource->NumUnits;
3934     LLVM_DEBUG({
3935       dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
3936                        ProcResource->Name, I->ProcResourceIdx,
3937                        ProcResourceCount[I->ProcResourceIdx], NumUnits,
3938                        I->Cycles);
3939     });
3940     if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits)
3941       return false;
3942   }
3943   LLVM_DEBUG(dbgs() << "return true\n\n";);
3944   return true;
3945 }
3946 
3947 void ResourceManager::reserveResources(const MCInstrDesc *MID) {
3948   LLVM_DEBUG({ dbgs() << "reserveResources:\n"; });
3949   if (UseDFA)
3950     return DFAResources->reserveResources(MID);
3951 
3952   unsigned InsnClass = MID->getSchedClass();
3953   const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
3954   if (!SCDesc->isValid()) {
3955     LLVM_DEBUG({
3956       dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3957       dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
3958     });
3959     return;
3960   }
3961   for (const MCWriteProcResEntry &PRE :
3962        make_range(STI->getWriteProcResBegin(SCDesc),
3963                   STI->getWriteProcResEnd(SCDesc))) {
3964     if (!PRE.Cycles)
3965       continue;
3966     ++ProcResourceCount[PRE.ProcResourceIdx];
3967     LLVM_DEBUG({
3968       const MCProcResourceDesc *ProcResource =
3969           SM.getProcResource(PRE.ProcResourceIdx);
3970       dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
3971                        ProcResource->Name, PRE.ProcResourceIdx,
3972                        ProcResourceCount[PRE.ProcResourceIdx],
3973                        ProcResource->NumUnits, PRE.Cycles);
3974     });
3975   }
3976   LLVM_DEBUG({ dbgs() << "reserveResources: done!\n\n"; });
3977 }
3978 
3979 bool ResourceManager::canReserveResources(const MachineInstr &MI) const {
3980   return canReserveResources(&MI.getDesc());
3981 }
3982 
3983 void ResourceManager::reserveResources(const MachineInstr &MI) {
3984   return reserveResources(&MI.getDesc());
3985 }
3986 
3987 void ResourceManager::clearResources() {
3988   if (UseDFA)
3989     return DFAResources->clearResources();
3990   std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
3991 }
3992 
3993