1 //===-- MachinePipeliner.cpp - Machine Software Pipeliner Pass ------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
11 //
12 // Software pipelining (SWP) is an instruction scheduling technique for loops
13 // that overlap loop iterations and explioits ILP via a compiler transformation.
14 //
15 // Swing Modulo Scheduling is an implementation of software pipelining
16 // that generates schedules that are near optimal in terms of initiation
17 // interval, register requirements, and stage count. See the papers:
18 //
19 // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
20 // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Processings of the 1996
21 // Conference on Parallel Architectures and Compilation Techiniques.
22 //
23 // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
24 // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
25 // Transactions on Computers, Vol. 50, No. 3, 2001.
26 //
27 // "An Implementation of Swing Modulo Scheduling With Extensions for
28 // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
29 // Urbana-Chambpain, 2005.
30 //
31 //
32 // The SMS algorithm consists of three main steps after computing the minimal
33 // initiation interval (MII).
34 // 1) Analyze the dependence graph and compute information about each
35 //    instruction in the graph.
36 // 2) Order the nodes (instructions) by priority based upon the heuristics
37 //    described in the algorithm.
38 // 3) Attempt to schedule the nodes in the specified order using the MII.
39 //
40 // This SMS implementation is a target-independent back-end pass. When enabled,
41 // the pass runs just prior to the register allocation pass, while the machine
42 // IR is in SSA form. If software pipelining is successful, then the original
43 // loop is replaced by the optimized loop. The optimized loop contains one or
44 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
45 // the instructions cannot be scheduled in a given MII, we increase the MII by
46 // one and try again.
47 //
48 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
49 // represent loop carried dependences in the DAG as order edges to the Phi
50 // nodes. We also perform several passes over the DAG to eliminate unnecessary
51 // edges that inhibit the ability to pipeline. The implementation uses the
52 // DFAPacketizer class to compute the minimum initiation interval and the check
53 // where an instruction may be inserted in the pipelined schedule.
54 //
55 // In order for the SMS pass to work, several target specific hooks need to be
56 // implemented to get information about the loop structure and to rewrite
57 // instructions.
58 //
59 //===----------------------------------------------------------------------===//
60 
61 #include "llvm/ADT/ArrayRef.h"
62 #include "llvm/ADT/BitVector.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/iterator_range.h"
65 #include "llvm/ADT/MapVector.h"
66 #include "llvm/ADT/PriorityQueue.h"
67 #include "llvm/ADT/SetVector.h"
68 #include "llvm/ADT/SmallPtrSet.h"
69 #include "llvm/ADT/SmallSet.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/Statistic.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/Analysis/MemoryLocation.h"
74 #include "llvm/Analysis/ValueTracking.h"
75 #include "llvm/CodeGen/DFAPacketizer.h"
76 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
77 #include "llvm/CodeGen/MachineBasicBlock.h"
78 #include "llvm/CodeGen/MachineDominators.h"
79 #include "llvm/CodeGen/MachineFunction.h"
80 #include "llvm/CodeGen/MachineFunctionPass.h"
81 #include "llvm/CodeGen/MachineInstr.h"
82 #include "llvm/CodeGen/MachineInstrBuilder.h"
83 #include "llvm/CodeGen/MachineInstrBundle.h"
84 #include "llvm/CodeGen/MachineLoopInfo.h"
85 #include "llvm/CodeGen/MachineMemOperand.h"
86 #include "llvm/CodeGen/MachineOperand.h"
87 #include "llvm/CodeGen/MachineRegisterInfo.h"
88 #include "llvm/CodeGen/RegisterClassInfo.h"
89 #include "llvm/CodeGen/RegisterPressure.h"
90 #include "llvm/CodeGen/ScheduleDAG.h"
91 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
92 #include "llvm/IR/Attributes.h"
93 #include "llvm/IR/DebugLoc.h"
94 #include "llvm/MC/MCInstrItineraries.h"
95 #include "llvm/PassAnalysisSupport.h"
96 #include "llvm/PassRegistry.h"
97 #include "llvm/PassSupport.h"
98 #include "llvm/Support/CommandLine.h"
99 #include "llvm/Support/Debug.h"
100 #include "llvm/Support/MathExtras.h"
101 #include "llvm/Support/raw_ostream.h"
102 #include "llvm/Target/TargetInstrInfo.h"
103 #include "llvm/Target/TargetRegisterInfo.h"
104 #include "llvm/Target/TargetSubtargetInfo.h"
105 #include <algorithm>
106 #include <cassert>
107 #include <climits>
108 #include <cstdint>
109 #include <deque>
110 #include <functional>
111 #include <iterator>
112 #include <map>
113 #include <tuple>
114 #include <utility>
115 #include <vector>
116 
117 using namespace llvm;
118 
119 #define DEBUG_TYPE "pipeliner"
120 
121 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
122 STATISTIC(NumPipelined, "Number of loops software pipelined");
123 
124 /// A command line option to turn software pipelining on or off.
125 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
126                                cl::ZeroOrMore,
127                                cl::desc("Enable Software Pipelining"));
128 
129 /// A command line option to enable SWP at -Os.
130 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
131                                       cl::desc("Enable SWP at Os."), cl::Hidden,
132                                       cl::init(false));
133 
134 /// A command line argument to limit minimum initial interval for pipelining.
135 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
136                               cl::desc("Size limit for the the MII."),
137                               cl::Hidden, cl::init(27));
138 
139 /// A command line argument to limit the number of stages in the pipeline.
140 static cl::opt<int>
141     SwpMaxStages("pipeliner-max-stages",
142                  cl::desc("Maximum stages allowed in the generated scheduled."),
143                  cl::Hidden, cl::init(3));
144 
145 /// A command line option to disable the pruning of chain dependences due to
146 /// an unrelated Phi.
147 static cl::opt<bool>
148     SwpPruneDeps("pipeliner-prune-deps",
149                  cl::desc("Prune dependences between unrelated Phi nodes."),
150                  cl::Hidden, cl::init(true));
151 
152 /// A command line option to disable the pruning of loop carried order
153 /// dependences.
154 static cl::opt<bool>
155     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
156                         cl::desc("Prune loop carried order dependences."),
157                         cl::Hidden, cl::init(true));
158 
159 #ifndef NDEBUG
160 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
161 #endif
162 
163 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
164                                      cl::ReallyHidden, cl::init(false),
165                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
166 
167 namespace {
168 
169 class NodeSet;
170 class SMSchedule;
171 class SwingSchedulerDAG;
172 
173 /// The main class in the implementation of the target independent
174 /// software pipeliner pass.
175 class MachinePipeliner : public MachineFunctionPass {
176 public:
177   MachineFunction *MF = nullptr;
178   const MachineLoopInfo *MLI = nullptr;
179   const MachineDominatorTree *MDT = nullptr;
180   const InstrItineraryData *InstrItins;
181   const TargetInstrInfo *TII = nullptr;
182   RegisterClassInfo RegClassInfo;
183 
184 #ifndef NDEBUG
185   static int NumTries;
186 #endif
187   /// Cache the target analysis information about the loop.
188   struct LoopInfo {
189     MachineBasicBlock *TBB = nullptr;
190     MachineBasicBlock *FBB = nullptr;
191     SmallVector<MachineOperand, 4> BrCond;
192     MachineInstr *LoopInductionVar = nullptr;
193     MachineInstr *LoopCompare = nullptr;
194   };
195   LoopInfo LI;
196 
197   static char ID;
198   MachinePipeliner() : MachineFunctionPass(ID) {
199     initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
200   }
201 
202   bool runOnMachineFunction(MachineFunction &MF) override;
203 
204   void getAnalysisUsage(AnalysisUsage &AU) const override {
205     AU.addRequired<AAResultsWrapperPass>();
206     AU.addPreserved<AAResultsWrapperPass>();
207     AU.addRequired<MachineLoopInfo>();
208     AU.addRequired<MachineDominatorTree>();
209     AU.addRequired<LiveIntervals>();
210     MachineFunctionPass::getAnalysisUsage(AU);
211   }
212 
213 private:
214   bool canPipelineLoop(MachineLoop &L);
215   bool scheduleLoop(MachineLoop &L);
216   bool swingModuloScheduler(MachineLoop &L);
217 };
218 
219 /// This class builds the dependence graph for the instructions in a loop,
220 /// and attempts to schedule the instructions using the SMS algorithm.
221 class SwingSchedulerDAG : public ScheduleDAGInstrs {
222   MachinePipeliner &Pass;
223   /// The minimum initiation interval between iterations for this schedule.
224   unsigned MII;
225   /// Set to true if a valid pipelined schedule is found for the loop.
226   bool Scheduled;
227   MachineLoop &Loop;
228   LiveIntervals &LIS;
229   const RegisterClassInfo &RegClassInfo;
230 
231   /// A toplogical ordering of the SUnits, which is needed for changing
232   /// dependences and iterating over the SUnits.
233   ScheduleDAGTopologicalSort Topo;
234 
235   struct NodeInfo {
236     int ASAP;
237     int ALAP;
238     NodeInfo() : ASAP(0), ALAP(0) {}
239   };
240   /// Computed properties for each node in the graph.
241   std::vector<NodeInfo> ScheduleInfo;
242 
243   enum OrderKind { BottomUp = 0, TopDown = 1 };
244   /// Computed node ordering for scheduling.
245   SetVector<SUnit *> NodeOrder;
246 
247   typedef SmallVector<NodeSet, 8> NodeSetType;
248   typedef DenseMap<unsigned, unsigned> ValueMapTy;
249   typedef SmallVectorImpl<MachineBasicBlock *> MBBVectorTy;
250   typedef DenseMap<MachineInstr *, MachineInstr *> InstrMapTy;
251 
252   /// Instructions to change when emitting the final schedule.
253   DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
254 
255   /// We may create a new instruction, so remember it because it
256   /// must be deleted when the pass is finished.
257   SmallPtrSet<MachineInstr *, 4> NewMIs;
258 
259   /// Helper class to implement Johnson's circuit finding algorithm.
260   class Circuits {
261     std::vector<SUnit> &SUnits;
262     SetVector<SUnit *> Stack;
263     BitVector Blocked;
264     SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
265     SmallVector<SmallVector<int, 4>, 16> AdjK;
266     unsigned NumPaths;
267     static unsigned MaxPaths;
268 
269   public:
270     Circuits(std::vector<SUnit> &SUs)
271         : SUnits(SUs), Stack(), Blocked(SUs.size()), B(SUs.size()),
272           AdjK(SUs.size()) {}
273     /// Reset the data structures used in the circuit algorithm.
274     void reset() {
275       Stack.clear();
276       Blocked.reset();
277       B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
278       NumPaths = 0;
279     }
280     void createAdjacencyStructure(SwingSchedulerDAG *DAG);
281     bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
282     void unblock(int U);
283   };
284 
285 public:
286   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
287                     const RegisterClassInfo &rci)
288       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), MII(0),
289         Scheduled(false), Loop(L), LIS(lis), RegClassInfo(rci),
290         Topo(SUnits, &ExitSU) {}
291 
292   void schedule() override;
293   void finishBlock() override;
294 
295   /// Return true if the loop kernel has been scheduled.
296   bool hasNewSchedule() { return Scheduled; }
297 
298   /// Return the earliest time an instruction may be scheduled.
299   int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
300 
301   /// Return the latest time an instruction my be scheduled.
302   int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
303 
304   /// The mobility function, which the the number of slots in which
305   /// an instruction may be scheduled.
306   int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
307 
308   /// The depth, in the dependence graph, for a node.
309   int getDepth(SUnit *Node) { return Node->getDepth(); }
310 
311   /// The height, in the dependence graph, for a node.
312   int getHeight(SUnit *Node) { return Node->getHeight(); }
313 
314   /// Return true if the dependence is a back-edge in the data dependence graph.
315   /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
316   /// using an anti dependence from a Phi to an instruction.
317   bool isBackedge(SUnit *Source, const SDep &Dep) {
318     if (Dep.getKind() != SDep::Anti)
319       return false;
320     return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
321   }
322 
323   /// Return true if the dependence is an order dependence between non-Phis.
324   static bool isOrder(SUnit *Source, const SDep &Dep) {
325     if (Dep.getKind() != SDep::Order)
326       return false;
327     return (!Source->getInstr()->isPHI() &&
328             !Dep.getSUnit()->getInstr()->isPHI());
329   }
330 
331   bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
332 
333   /// The latency of the dependence.
334   unsigned getLatency(SUnit *Source, const SDep &Dep) {
335     // Anti dependences represent recurrences, so use the latency of the
336     // instruction on the back-edge.
337     if (Dep.getKind() == SDep::Anti) {
338       if (Source->getInstr()->isPHI())
339         return Dep.getSUnit()->Latency;
340       if (Dep.getSUnit()->getInstr()->isPHI())
341         return Source->Latency;
342       return Dep.getLatency();
343     }
344     return Dep.getLatency();
345   }
346 
347   /// The distance function, which indicates that operation V of iteration I
348   /// depends on operations U of iteration I-distance.
349   unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
350     // Instructions that feed a Phi have a distance of 1. Computing larger
351     // values for arrays requires data dependence information.
352     if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
353       return 1;
354     return 0;
355   }
356 
357   /// Set the Minimum Initiation Interval for this schedule attempt.
358   void setMII(unsigned mii) { MII = mii; }
359 
360   MachineInstr *applyInstrChange(MachineInstr *MI, SMSchedule &Schedule,
361                                  bool UpdateDAG = false);
362 
363   /// Return the new base register that was stored away for the changed
364   /// instruction.
365   unsigned getInstrBaseReg(SUnit *SU) {
366     DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
367         InstrChanges.find(SU);
368     if (It != InstrChanges.end())
369       return It->second.first;
370     return 0;
371   }
372 
373 private:
374   void addLoopCarriedDependences(AliasAnalysis *AA);
375   void updatePhiDependences();
376   void changeDependences();
377   unsigned calculateResMII();
378   unsigned calculateRecMII(NodeSetType &RecNodeSets);
379   void findCircuits(NodeSetType &NodeSets);
380   void fuseRecs(NodeSetType &NodeSets);
381   void removeDuplicateNodes(NodeSetType &NodeSets);
382   void computeNodeFunctions(NodeSetType &NodeSets);
383   void registerPressureFilter(NodeSetType &NodeSets);
384   void colocateNodeSets(NodeSetType &NodeSets);
385   void checkNodeSets(NodeSetType &NodeSets);
386   void groupRemainingNodes(NodeSetType &NodeSets);
387   void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
388                          SetVector<SUnit *> &NodesAdded);
389   void computeNodeOrder(NodeSetType &NodeSets);
390   bool schedulePipeline(SMSchedule &Schedule);
391   void generatePipelinedLoop(SMSchedule &Schedule);
392   void generateProlog(SMSchedule &Schedule, unsigned LastStage,
393                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
394                       MBBVectorTy &PrologBBs);
395   void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
396                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
397                       MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
398   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
399                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
400                             SMSchedule &Schedule, ValueMapTy *VRMap,
401                             InstrMapTy &InstrMap, unsigned LastStageNum,
402                             unsigned CurStageNum, bool IsLast);
403   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
404                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
405                     SMSchedule &Schedule, ValueMapTy *VRMap,
406                     InstrMapTy &InstrMap, unsigned LastStageNum,
407                     unsigned CurStageNum, bool IsLast);
408   void removeDeadInstructions(MachineBasicBlock *KernelBB,
409                               MBBVectorTy &EpilogBBs);
410   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
411                       SMSchedule &Schedule);
412   void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
413                    MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
414                    ValueMapTy *VRMap);
415   bool computeDelta(MachineInstr &MI, unsigned &Delta);
416   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
417                          unsigned Num);
418   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
419                            unsigned InstStageNum);
420   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
421                                     unsigned InstStageNum,
422                                     SMSchedule &Schedule);
423   void updateInstruction(MachineInstr *NewMI, bool LastDef,
424                          unsigned CurStageNum, unsigned InstStageNum,
425                          SMSchedule &Schedule, ValueMapTy *VRMap);
426   MachineInstr *findDefInLoop(unsigned Reg);
427   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
428                          unsigned LoopStage, ValueMapTy *VRMap,
429                          MachineBasicBlock *BB);
430   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
431                         SMSchedule &Schedule, ValueMapTy *VRMap,
432                         InstrMapTy &InstrMap);
433   void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
434                              InstrMapTy &InstrMap, unsigned CurStageNum,
435                              unsigned PhiNum, MachineInstr *Phi,
436                              unsigned OldReg, unsigned NewReg,
437                              unsigned PrevReg = 0);
438   bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
439                              unsigned &OffsetPos, unsigned &NewBase,
440                              int64_t &NewOffset);
441 };
442 
443 /// A NodeSet contains a set of SUnit DAG nodes with additional information
444 /// that assigns a priority to the set.
445 class NodeSet {
446   SetVector<SUnit *> Nodes;
447   bool HasRecurrence;
448   unsigned RecMII = 0;
449   int MaxMOV = 0;
450   int MaxDepth = 0;
451   unsigned Colocate = 0;
452   SUnit *ExceedPressure = nullptr;
453 
454 public:
455   typedef SetVector<SUnit *>::const_iterator iterator;
456 
457   NodeSet() : Nodes(), HasRecurrence(false) {}
458 
459   NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
460 
461   bool insert(SUnit *SU) { return Nodes.insert(SU); }
462 
463   void insert(iterator S, iterator E) { Nodes.insert(S, E); }
464 
465   template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
466     return Nodes.remove_if(P);
467   }
468 
469   unsigned count(SUnit *SU) const { return Nodes.count(SU); }
470 
471   bool hasRecurrence() { return HasRecurrence; };
472 
473   unsigned size() const { return Nodes.size(); }
474 
475   bool empty() const { return Nodes.empty(); }
476 
477   SUnit *getNode(unsigned i) const { return Nodes[i]; };
478 
479   void setRecMII(unsigned mii) { RecMII = mii; };
480 
481   void setColocate(unsigned c) { Colocate = c; };
482 
483   void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
484 
485   bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
486 
487   int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
488 
489   int getRecMII() { return RecMII; }
490 
491   /// Summarize node functions for the entire node set.
492   void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
493     for (SUnit *SU : *this) {
494       MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
495       MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
496     }
497   }
498 
499   void clear() {
500     Nodes.clear();
501     RecMII = 0;
502     HasRecurrence = false;
503     MaxMOV = 0;
504     MaxDepth = 0;
505     Colocate = 0;
506     ExceedPressure = nullptr;
507   }
508 
509   operator SetVector<SUnit *> &() { return Nodes; }
510 
511   /// Sort the node sets by importance. First, rank them by recurrence MII,
512   /// then by mobility (least mobile done first), and finally by depth.
513   /// Each node set may contain a colocate value which is used as the first
514   /// tie breaker, if it's set.
515   bool operator>(const NodeSet &RHS) const {
516     if (RecMII == RHS.RecMII) {
517       if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
518         return Colocate < RHS.Colocate;
519       if (MaxMOV == RHS.MaxMOV)
520         return MaxDepth > RHS.MaxDepth;
521       return MaxMOV < RHS.MaxMOV;
522     }
523     return RecMII > RHS.RecMII;
524   }
525 
526   bool operator==(const NodeSet &RHS) const {
527     return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
528            MaxDepth == RHS.MaxDepth;
529   }
530 
531   bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
532 
533   iterator begin() { return Nodes.begin(); }
534   iterator end() { return Nodes.end(); }
535 
536   void print(raw_ostream &os) const {
537     os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
538        << " depth " << MaxDepth << " col " << Colocate << "\n";
539     for (const auto &I : Nodes)
540       os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
541     os << "\n";
542   }
543 
544   void dump() const { print(dbgs()); }
545 };
546 
547 /// This class repesents the scheduled code.  The main data structure is a
548 /// map from scheduled cycle to instructions.  During scheduling, the
549 /// data structure explicitly represents all stages/iterations.   When
550 /// the algorithm finshes, the schedule is collapsed into a single stage,
551 /// which represents instructions from different loop iterations.
552 ///
553 /// The SMS algorithm allows negative values for cycles, so the first cycle
554 /// in the schedule is the smallest cycle value.
555 class SMSchedule {
556 private:
557   /// Map from execution cycle to instructions.
558   DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
559 
560   /// Map from instruction to execution cycle.
561   std::map<SUnit *, int> InstrToCycle;
562 
563   /// Map for each register and the max difference between its uses and def.
564   /// The first element in the pair is the max difference in stages. The
565   /// second is true if the register defines a Phi value and loop value is
566   /// scheduled before the Phi.
567   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
568 
569   /// Keep track of the first cycle value in the schedule.  It starts
570   /// as zero, but the algorithm allows negative values.
571   int FirstCycle;
572 
573   /// Keep track of the last cycle value in the schedule.
574   int LastCycle;
575 
576   /// The initiation interval (II) for the schedule.
577   int InitiationInterval;
578 
579   /// Target machine information.
580   const TargetSubtargetInfo &ST;
581 
582   /// Virtual register information.
583   MachineRegisterInfo &MRI;
584 
585   DFAPacketizer *Resources;
586 
587 public:
588   SMSchedule(MachineFunction *mf)
589       : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
590         Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {
591     FirstCycle = 0;
592     LastCycle = 0;
593     InitiationInterval = 0;
594   }
595 
596   ~SMSchedule() {
597     ScheduledInstrs.clear();
598     InstrToCycle.clear();
599     RegToStageDiff.clear();
600     delete Resources;
601   }
602 
603   void reset() {
604     ScheduledInstrs.clear();
605     InstrToCycle.clear();
606     RegToStageDiff.clear();
607     FirstCycle = 0;
608     LastCycle = 0;
609     InitiationInterval = 0;
610   }
611 
612   /// Set the initiation interval for this schedule.
613   void setInitiationInterval(int ii) { InitiationInterval = ii; }
614 
615   /// Return the first cycle in the completed schedule.  This
616   /// can be a negative value.
617   int getFirstCycle() const { return FirstCycle; }
618 
619   /// Return the last cycle in the finalized schedule.
620   int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
621 
622   /// Return the cycle of the earliest scheduled instruction in the dependence
623   /// chain.
624   int earliestCycleInChain(const SDep &Dep);
625 
626   /// Return the cycle of the latest scheduled instruction in the dependence
627   /// chain.
628   int latestCycleInChain(const SDep &Dep);
629 
630   void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
631                     int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
632   bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
633 
634   /// Iterators for the cycle to instruction map.
635   typedef DenseMap<int, std::deque<SUnit *>>::iterator sched_iterator;
636   typedef DenseMap<int, std::deque<SUnit *>>::const_iterator
637       const_sched_iterator;
638 
639   /// Return true if the instruction is scheduled at the specified stage.
640   bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
641     return (stageScheduled(SU) == (int)StageNum);
642   }
643 
644   /// Return the stage for a scheduled instruction.  Return -1 if
645   /// the instruction has not been scheduled.
646   int stageScheduled(SUnit *SU) const {
647     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
648     if (it == InstrToCycle.end())
649       return -1;
650     return (it->second - FirstCycle) / InitiationInterval;
651   }
652 
653   /// Return the cycle for a scheduled instruction. This function normalizes
654   /// the first cycle to be 0.
655   unsigned cycleScheduled(SUnit *SU) const {
656     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
657     assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
658     return (it->second - FirstCycle) % InitiationInterval;
659   }
660 
661   /// Return the maximum stage count needed for this schedule.
662   unsigned getMaxStageCount() {
663     return (LastCycle - FirstCycle) / InitiationInterval;
664   }
665 
666   /// Return the max. number of stages/iterations that can occur between a
667   /// register definition and its uses.
668   unsigned getStagesForReg(int Reg, unsigned CurStage) {
669     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
670     if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
671       return 1;
672     return Stages.first;
673   }
674 
675   /// The number of stages for a Phi is a little different than other
676   /// instructions. The minimum value computed in RegToStageDiff is 1
677   /// because we assume the Phi is needed for at least 1 iteration.
678   /// This is not the case if the loop value is scheduled prior to the
679   /// Phi in the same stage.  This function returns the number of stages
680   /// or iterations needed between the Phi definition and any uses.
681   unsigned getStagesForPhi(int Reg) {
682     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
683     if (Stages.second)
684       return Stages.first;
685     return Stages.first - 1;
686   }
687 
688   /// Return the instructions that are scheduled at the specified cycle.
689   std::deque<SUnit *> &getInstructions(int cycle) {
690     return ScheduledInstrs[cycle];
691   }
692 
693   bool isValidSchedule(SwingSchedulerDAG *SSD);
694   void finalizeSchedule(SwingSchedulerDAG *SSD);
695   bool orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
696                        std::deque<SUnit *> &Insts);
697   bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
698   bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Inst,
699                              MachineOperand &MO);
700   void print(raw_ostream &os) const;
701   void dump() const;
702 };
703 
704 } // end anonymous namespace
705 
706 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
707 char MachinePipeliner::ID = 0;
708 #ifndef NDEBUG
709 int MachinePipeliner::NumTries = 0;
710 #endif
711 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
712 INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner",
713                       "Modulo Software Pipelining", false, false)
714 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
715 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
716 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
717 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
718 INITIALIZE_PASS_END(MachinePipeliner, "pipeliner",
719                     "Modulo Software Pipelining", false, false)
720 
721 /// The "main" function for implementing Swing Modulo Scheduling.
722 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
723   if (skipFunction(*mf.getFunction()))
724     return false;
725 
726   if (!EnableSWP)
727     return false;
728 
729   if (mf.getFunction()->getAttributes().hasAttribute(
730           AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
731       !EnableSWPOptSize.getPosition())
732     return false;
733 
734   MF = &mf;
735   MLI = &getAnalysis<MachineLoopInfo>();
736   MDT = &getAnalysis<MachineDominatorTree>();
737   TII = MF->getSubtarget().getInstrInfo();
738   RegClassInfo.runOnMachineFunction(*MF);
739 
740   for (auto &L : *MLI)
741     scheduleLoop(*L);
742 
743   return false;
744 }
745 
746 /// Attempt to perform the SMS algorithm on the specified loop. This function is
747 /// the main entry point for the algorithm.  The function identifies candidate
748 /// loops, calculates the minimum initiation interval, and attempts to schedule
749 /// the loop.
750 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
751   bool Changed = false;
752   for (auto &InnerLoop : L)
753     Changed |= scheduleLoop(*InnerLoop);
754 
755 #ifndef NDEBUG
756   // Stop trying after reaching the limit (if any).
757   int Limit = SwpLoopLimit;
758   if (Limit >= 0) {
759     if (NumTries >= SwpLoopLimit)
760       return Changed;
761     NumTries++;
762   }
763 #endif
764 
765   if (!canPipelineLoop(L))
766     return Changed;
767 
768   ++NumTrytoPipeline;
769 
770   Changed = swingModuloScheduler(L);
771 
772   return Changed;
773 }
774 
775 /// Return true if the loop can be software pipelined.  The algorithm is
776 /// restricted to loops with a single basic block.  Make sure that the
777 /// branch in the loop can be analyzed.
778 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
779   if (L.getNumBlocks() != 1)
780     return false;
781 
782   // Check if the branch can't be understood because we can't do pipelining
783   // if that's the case.
784   LI.TBB = nullptr;
785   LI.FBB = nullptr;
786   LI.BrCond.clear();
787   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
788     return false;
789 
790   LI.LoopInductionVar = nullptr;
791   LI.LoopCompare = nullptr;
792   if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
793     return false;
794 
795   if (!L.getLoopPreheader())
796     return false;
797 
798   // If any of the Phis contain subregs, then we can't pipeline
799   // because we don't know how to maintain subreg information in the
800   // VMap structure.
801   MachineBasicBlock *MBB = L.getHeader();
802   for (MachineBasicBlock::iterator BBI = MBB->instr_begin(),
803                                    BBE = MBB->getFirstNonPHI();
804        BBI != BBE; ++BBI)
805     for (unsigned i = 1; i != BBI->getNumOperands(); i += 2)
806       if (BBI->getOperand(i).getSubReg() != 0)
807         return false;
808 
809   return true;
810 }
811 
812 /// The SMS algorithm consists of the following main steps:
813 /// 1. Computation and analysis of the dependence graph.
814 /// 2. Ordering of the nodes (instructions).
815 /// 3. Attempt to Schedule the loop.
816 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
817   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
818 
819   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
820 
821   MachineBasicBlock *MBB = L.getHeader();
822   // The kernel should not include any terminator instructions.  These
823   // will be added back later.
824   SMS.startBlock(MBB);
825 
826   // Compute the number of 'real' instructions in the basic block by
827   // ignoring terminators.
828   unsigned size = MBB->size();
829   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
830                                    E = MBB->instr_end();
831        I != E; ++I, --size)
832     ;
833 
834   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
835   SMS.schedule();
836   SMS.exitRegion();
837 
838   SMS.finishBlock();
839   return SMS.hasNewSchedule();
840 }
841 
842 /// We override the schedule function in ScheduleDAGInstrs to implement the
843 /// scheduling part of the Swing Modulo Scheduling algorithm.
844 void SwingSchedulerDAG::schedule() {
845   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
846   buildSchedGraph(AA);
847   addLoopCarriedDependences(AA);
848   updatePhiDependences();
849   Topo.InitDAGTopologicalSorting();
850   changeDependences();
851   DEBUG({
852     for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
853       SUnits[su].dumpAll(this);
854   });
855 
856   NodeSetType NodeSets;
857   findCircuits(NodeSets);
858 
859   // Calculate the MII.
860   unsigned ResMII = calculateResMII();
861   unsigned RecMII = calculateRecMII(NodeSets);
862 
863   fuseRecs(NodeSets);
864 
865   // This flag is used for testing and can cause correctness problems.
866   if (SwpIgnoreRecMII)
867     RecMII = 0;
868 
869   MII = std::max(ResMII, RecMII);
870   DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII << ", res=" << ResMII
871                << ")\n");
872 
873   // Can't schedule a loop without a valid MII.
874   if (MII == 0)
875     return;
876 
877   // Don't pipeline large loops.
878   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
879     return;
880 
881   computeNodeFunctions(NodeSets);
882 
883   registerPressureFilter(NodeSets);
884 
885   colocateNodeSets(NodeSets);
886 
887   checkNodeSets(NodeSets);
888 
889   DEBUG({
890     for (auto &I : NodeSets) {
891       dbgs() << "  Rec NodeSet ";
892       I.dump();
893     }
894   });
895 
896   std::sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
897 
898   groupRemainingNodes(NodeSets);
899 
900   removeDuplicateNodes(NodeSets);
901 
902   DEBUG({
903     for (auto &I : NodeSets) {
904       dbgs() << "  NodeSet ";
905       I.dump();
906     }
907   });
908 
909   computeNodeOrder(NodeSets);
910 
911   SMSchedule Schedule(Pass.MF);
912   Scheduled = schedulePipeline(Schedule);
913 
914   if (!Scheduled)
915     return;
916 
917   unsigned numStages = Schedule.getMaxStageCount();
918   // No need to generate pipeline if there are no overlapped iterations.
919   if (numStages == 0)
920     return;
921 
922   // Check that the maximum stage count is less than user-defined limit.
923   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
924     return;
925 
926   generatePipelinedLoop(Schedule);
927   ++NumPipelined;
928 }
929 
930 /// Clean up after the software pipeliner runs.
931 void SwingSchedulerDAG::finishBlock() {
932   for (MachineInstr *I : NewMIs)
933     MF.DeleteMachineInstr(I);
934   NewMIs.clear();
935 
936   // Call the superclass.
937   ScheduleDAGInstrs::finishBlock();
938 }
939 
940 /// Return the register values for  the operands of a Phi instruction.
941 /// This function assume the instruction is a Phi.
942 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
943                        unsigned &InitVal, unsigned &LoopVal) {
944   assert(Phi.isPHI() && "Expecting a Phi.");
945 
946   InitVal = 0;
947   LoopVal = 0;
948   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
949     if (Phi.getOperand(i + 1).getMBB() != Loop)
950       InitVal = Phi.getOperand(i).getReg();
951     else if (Phi.getOperand(i + 1).getMBB() == Loop)
952       LoopVal = Phi.getOperand(i).getReg();
953 
954   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
955 }
956 
957 /// Return the Phi register value that comes from the incoming block.
958 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
959   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
960     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
961       return Phi.getOperand(i).getReg();
962   return 0;
963 }
964 
965 /// Return the Phi register value that comes the the loop block.
966 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
967   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
968     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
969       return Phi.getOperand(i).getReg();
970   return 0;
971 }
972 
973 /// Return true if SUb can be reached from SUa following the chain edges.
974 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
975   SmallPtrSet<SUnit *, 8> Visited;
976   SmallVector<SUnit *, 8> Worklist;
977   Worklist.push_back(SUa);
978   while (!Worklist.empty()) {
979     const SUnit *SU = Worklist.pop_back_val();
980     for (auto &SI : SU->Succs) {
981       SUnit *SuccSU = SI.getSUnit();
982       if (SI.getKind() == SDep::Order) {
983         if (Visited.count(SuccSU))
984           continue;
985         if (SuccSU == SUb)
986           return true;
987         Worklist.push_back(SuccSU);
988         Visited.insert(SuccSU);
989       }
990     }
991   }
992   return false;
993 }
994 
995 /// Return true if the instruction causes a chain between memory
996 /// references before and after it.
997 static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
998   return MI.isCall() || MI.hasUnmodeledSideEffects() ||
999          (MI.hasOrderedMemoryRef() &&
1000           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
1001 }
1002 
1003 /// Return the underlying objects for the memory references of an instruction.
1004 /// This function calls the code in ValueTracking, but first checks that the
1005 /// instruction has a memory operand.
1006 static void getUnderlyingObjects(MachineInstr *MI,
1007                                  SmallVectorImpl<Value *> &Objs,
1008                                  const DataLayout &DL) {
1009   if (!MI->hasOneMemOperand())
1010     return;
1011   MachineMemOperand *MM = *MI->memoperands_begin();
1012   if (!MM->getValue())
1013     return;
1014   GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
1015 }
1016 
1017 /// Add a chain edge between a load and store if the store can be an
1018 /// alias of the load on a subsequent iteration, i.e., a loop carried
1019 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
1020 /// but that code doesn't create loop carried dependences.
1021 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
1022   MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
1023   for (auto &SU : SUnits) {
1024     MachineInstr &MI = *SU.getInstr();
1025     if (isDependenceBarrier(MI, AA))
1026       PendingLoads.clear();
1027     else if (MI.mayLoad()) {
1028       SmallVector<Value *, 4> Objs;
1029       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1030       for (auto V : Objs) {
1031         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
1032         SUs.push_back(&SU);
1033       }
1034     } else if (MI.mayStore()) {
1035       SmallVector<Value *, 4> Objs;
1036       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1037       for (auto V : Objs) {
1038         MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
1039             PendingLoads.find(V);
1040         if (I == PendingLoads.end())
1041           continue;
1042         for (auto Load : I->second) {
1043           if (isSuccOrder(Load, &SU))
1044             continue;
1045           MachineInstr &LdMI = *Load->getInstr();
1046           // First, perform the cheaper check that compares the base register.
1047           // If they are the same and the load offset is less than the store
1048           // offset, then mark the dependence as loop carried potentially.
1049           unsigned BaseReg1, BaseReg2;
1050           int64_t Offset1, Offset2;
1051           if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
1052               !TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
1053             SU.addPred(SDep(Load, SDep::Barrier));
1054             continue;
1055           }
1056           if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
1057             assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
1058                    "What happened to the chain edge?");
1059             SU.addPred(SDep(Load, SDep::Barrier));
1060             continue;
1061           }
1062           // Second, the more expensive check that uses alias analysis on the
1063           // base registers. If they alias, and the load offset is less than
1064           // the store offset, the mark the dependence as loop carried.
1065           if (!AA) {
1066             SU.addPred(SDep(Load, SDep::Barrier));
1067             continue;
1068           }
1069           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
1070           MachineMemOperand *MMO2 = *MI.memoperands_begin();
1071           if (!MMO1->getValue() || !MMO2->getValue()) {
1072             SU.addPred(SDep(Load, SDep::Barrier));
1073             continue;
1074           }
1075           if (MMO1->getValue() == MMO2->getValue() &&
1076               MMO1->getOffset() <= MMO2->getOffset()) {
1077             SU.addPred(SDep(Load, SDep::Barrier));
1078             continue;
1079           }
1080           AliasResult AAResult = AA->alias(
1081               MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
1082                              MMO1->getAAInfo()),
1083               MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
1084                              MMO2->getAAInfo()));
1085 
1086           if (AAResult != NoAlias)
1087             SU.addPred(SDep(Load, SDep::Barrier));
1088         }
1089       }
1090     }
1091   }
1092 }
1093 
1094 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1095 /// processes dependences for PHIs. This function adds true dependences
1096 /// from a PHI to a use, and a loop carried dependence from the use to the
1097 /// PHI. The loop carried dependence is represented as an anti dependence
1098 /// edge. This function also removes chain dependences between unrelated
1099 /// PHIs.
1100 void SwingSchedulerDAG::updatePhiDependences() {
1101   SmallVector<SDep, 4> RemoveDeps;
1102   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1103 
1104   // Iterate over each DAG node.
1105   for (SUnit &I : SUnits) {
1106     RemoveDeps.clear();
1107     // Set to true if the instruction has an operand defined by a Phi.
1108     unsigned HasPhiUse = 0;
1109     unsigned HasPhiDef = 0;
1110     MachineInstr *MI = I.getInstr();
1111     // Iterate over each operand, and we process the definitions.
1112     for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
1113                                     MOE = MI->operands_end();
1114          MOI != MOE; ++MOI) {
1115       if (!MOI->isReg())
1116         continue;
1117       unsigned Reg = MOI->getReg();
1118       if (MOI->isDef()) {
1119         // If the register is used by a Phi, then create an anti dependence.
1120         for (MachineRegisterInfo::use_instr_iterator
1121                  UI = MRI.use_instr_begin(Reg),
1122                  UE = MRI.use_instr_end();
1123              UI != UE; ++UI) {
1124           MachineInstr *UseMI = &*UI;
1125           SUnit *SU = getSUnit(UseMI);
1126           if (SU != nullptr && UseMI->isPHI()) {
1127             if (!MI->isPHI()) {
1128               SDep Dep(SU, SDep::Anti, Reg);
1129               I.addPred(Dep);
1130             } else {
1131               HasPhiDef = Reg;
1132               // Add a chain edge to a dependent Phi that isn't an existing
1133               // predecessor.
1134               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1135                 I.addPred(SDep(SU, SDep::Barrier));
1136             }
1137           }
1138         }
1139       } else if (MOI->isUse()) {
1140         // If the register is defined by a Phi, then create a true dependence.
1141         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1142         if (DefMI == nullptr)
1143           continue;
1144         SUnit *SU = getSUnit(DefMI);
1145         if (SU != nullptr && DefMI->isPHI()) {
1146           if (!MI->isPHI()) {
1147             SDep Dep(SU, SDep::Data, Reg);
1148             Dep.setLatency(0);
1149             ST.adjustSchedDependency(SU, &I, Dep);
1150             I.addPred(Dep);
1151           } else {
1152             HasPhiUse = Reg;
1153             // Add a chain edge to a dependent Phi that isn't an existing
1154             // predecessor.
1155             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1156               I.addPred(SDep(SU, SDep::Barrier));
1157           }
1158         }
1159       }
1160     }
1161     // Remove order dependences from an unrelated Phi.
1162     if (!SwpPruneDeps)
1163       continue;
1164     for (auto &PI : I.Preds) {
1165       MachineInstr *PMI = PI.getSUnit()->getInstr();
1166       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1167         if (I.getInstr()->isPHI()) {
1168           if (PMI->getOperand(0).getReg() == HasPhiUse)
1169             continue;
1170           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1171             continue;
1172         }
1173         RemoveDeps.push_back(PI);
1174       }
1175     }
1176     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
1177       I.removePred(RemoveDeps[i]);
1178   }
1179 }
1180 
1181 /// Iterate over each DAG node and see if we can change any dependences
1182 /// in order to reduce the recurrence MII.
1183 void SwingSchedulerDAG::changeDependences() {
1184   // See if an instruction can use a value from the previous iteration.
1185   // If so, we update the base and offset of the instruction and change
1186   // the dependences.
1187   for (SUnit &I : SUnits) {
1188     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1189     int64_t NewOffset = 0;
1190     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1191                                NewOffset))
1192       continue;
1193 
1194     // Get the MI and SUnit for the instruction that defines the original base.
1195     unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1196     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1197     if (!DefMI)
1198       continue;
1199     SUnit *DefSU = getSUnit(DefMI);
1200     if (!DefSU)
1201       continue;
1202     // Get the MI and SUnit for the instruction that defins the new base.
1203     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1204     if (!LastMI)
1205       continue;
1206     SUnit *LastSU = getSUnit(LastMI);
1207     if (!LastSU)
1208       continue;
1209 
1210     if (Topo.IsReachable(&I, LastSU))
1211       continue;
1212 
1213     // Remove the dependence. The value now depends on a prior iteration.
1214     SmallVector<SDep, 4> Deps;
1215     for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
1216          ++P)
1217       if (P->getSUnit() == DefSU)
1218         Deps.push_back(*P);
1219     for (int i = 0, e = Deps.size(); i != e; i++) {
1220       Topo.RemovePred(&I, Deps[i].getSUnit());
1221       I.removePred(Deps[i]);
1222     }
1223     // Remove the chain dependence between the instructions.
1224     Deps.clear();
1225     for (auto &P : LastSU->Preds)
1226       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1227         Deps.push_back(P);
1228     for (int i = 0, e = Deps.size(); i != e; i++) {
1229       Topo.RemovePred(LastSU, Deps[i].getSUnit());
1230       LastSU->removePred(Deps[i]);
1231     }
1232 
1233     // Add a dependence between the new instruction and the instruction
1234     // that defines the new base.
1235     SDep Dep(&I, SDep::Anti, NewBase);
1236     LastSU->addPred(Dep);
1237 
1238     // Remember the base and offset information so that we can update the
1239     // instruction during code generation.
1240     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1241   }
1242 }
1243 
1244 namespace {
1245 
1246 // FuncUnitSorter - Comparison operator used to sort instructions by
1247 // the number of functional unit choices.
1248 struct FuncUnitSorter {
1249   const InstrItineraryData *InstrItins;
1250   DenseMap<unsigned, unsigned> Resources;
1251 
1252   // Compute the number of functional unit alternatives needed
1253   // at each stage, and take the minimum value. We prioritize the
1254   // instructions by the least number of choices first.
1255   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
1256     unsigned schedClass = Inst->getDesc().getSchedClass();
1257     unsigned min = UINT_MAX;
1258     for (const InstrStage *IS = InstrItins->beginStage(schedClass),
1259                           *IE = InstrItins->endStage(schedClass);
1260          IS != IE; ++IS) {
1261       unsigned funcUnits = IS->getUnits();
1262       unsigned numAlternatives = countPopulation(funcUnits);
1263       if (numAlternatives < min) {
1264         min = numAlternatives;
1265         F = funcUnits;
1266       }
1267     }
1268     return min;
1269   }
1270 
1271   // Compute the critical resources needed by the instruction. This
1272   // function records the functional units needed by instructions that
1273   // must use only one functional unit. We use this as a tie breaker
1274   // for computing the resource MII. The instrutions that require
1275   // the same, highly used, functional unit have high priority.
1276   void calcCriticalResources(MachineInstr &MI) {
1277     unsigned SchedClass = MI.getDesc().getSchedClass();
1278     for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
1279                           *IE = InstrItins->endStage(SchedClass);
1280          IS != IE; ++IS) {
1281       unsigned FuncUnits = IS->getUnits();
1282       if (countPopulation(FuncUnits) == 1)
1283         Resources[FuncUnits]++;
1284     }
1285   }
1286 
1287   FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
1288   /// Return true if IS1 has less priority than IS2.
1289   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1290     unsigned F1 = 0, F2 = 0;
1291     unsigned MFUs1 = minFuncUnits(IS1, F1);
1292     unsigned MFUs2 = minFuncUnits(IS2, F2);
1293     if (MFUs1 == 1 && MFUs2 == 1)
1294       return Resources.lookup(F1) < Resources.lookup(F2);
1295     return MFUs1 > MFUs2;
1296   }
1297 };
1298 
1299 } // end anonymous namespace
1300 
1301 /// Calculate the resource constrained minimum initiation interval for the
1302 /// specified loop. We use the DFA to model the resources needed for
1303 /// each instruction, and we ignore dependences. A different DFA is created
1304 /// for each cycle that is required. When adding a new instruction, we attempt
1305 /// to add it to each existing DFA, until a legal space is found. If the
1306 /// instruction cannot be reserved in an existing DFA, we create a new one.
1307 unsigned SwingSchedulerDAG::calculateResMII() {
1308   SmallVector<DFAPacketizer *, 8> Resources;
1309   MachineBasicBlock *MBB = Loop.getHeader();
1310   Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
1311 
1312   // Sort the instructions by the number of available choices for scheduling,
1313   // least to most. Use the number of critical resources as the tie breaker.
1314   FuncUnitSorter FUS =
1315       FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
1316   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1317                                    E = MBB->getFirstTerminator();
1318        I != E; ++I)
1319     FUS.calcCriticalResources(*I);
1320   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
1321       FuncUnitOrder(FUS);
1322 
1323   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1324                                    E = MBB->getFirstTerminator();
1325        I != E; ++I)
1326     FuncUnitOrder.push(&*I);
1327 
1328   while (!FuncUnitOrder.empty()) {
1329     MachineInstr *MI = FuncUnitOrder.top();
1330     FuncUnitOrder.pop();
1331     if (TII->isZeroCost(MI->getOpcode()))
1332       continue;
1333     // Attempt to reserve the instruction in an existing DFA. At least one
1334     // DFA is needed for each cycle.
1335     unsigned NumCycles = getSUnit(MI)->Latency;
1336     unsigned ReservedCycles = 0;
1337     SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
1338     SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
1339     for (unsigned C = 0; C < NumCycles; ++C)
1340       while (RI != RE) {
1341         if ((*RI++)->canReserveResources(*MI)) {
1342           ++ReservedCycles;
1343           break;
1344         }
1345       }
1346     // Start reserving resources using existing DFAs.
1347     for (unsigned C = 0; C < ReservedCycles; ++C) {
1348       --RI;
1349       (*RI)->reserveResources(*MI);
1350     }
1351     // Add new DFAs, if needed, to reserve resources.
1352     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1353       DFAPacketizer *NewResource =
1354           TII->CreateTargetScheduleState(MF.getSubtarget());
1355       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1356       NewResource->reserveResources(*MI);
1357       Resources.push_back(NewResource);
1358     }
1359   }
1360   int Resmii = Resources.size();
1361   // Delete the memory for each of the DFAs that were created earlier.
1362   for (DFAPacketizer *RI : Resources) {
1363     DFAPacketizer *D = RI;
1364     delete D;
1365   }
1366   Resources.clear();
1367   return Resmii;
1368 }
1369 
1370 /// Calculate the recurrence-constrainted minimum initiation interval.
1371 /// Iterate over each circuit.  Compute the delay(c) and distance(c)
1372 /// for each circuit. The II needs to satisfy the inequality
1373 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1374 /// II that satistifies the inequality, and the RecMII is the maximum
1375 /// of those values.
1376 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1377   unsigned RecMII = 0;
1378 
1379   for (NodeSet &Nodes : NodeSets) {
1380     if (Nodes.size() == 0)
1381       continue;
1382 
1383     unsigned Delay = Nodes.size() - 1;
1384     unsigned Distance = 1;
1385 
1386     // ii = ceil(delay / distance)
1387     unsigned CurMII = (Delay + Distance - 1) / Distance;
1388     Nodes.setRecMII(CurMII);
1389     if (CurMII > RecMII)
1390       RecMII = CurMII;
1391   }
1392 
1393   return RecMII;
1394 }
1395 
1396 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1397 /// but we do this to find the circuits, and then change them back.
1398 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1399   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1400   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1401     SUnit *SU = &SUnits[i];
1402     for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1403          IP != EP; ++IP) {
1404       if (IP->getKind() != SDep::Anti)
1405         continue;
1406       DepsAdded.push_back(std::make_pair(SU, *IP));
1407     }
1408   }
1409   for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1410                                                           E = DepsAdded.end();
1411        I != E; ++I) {
1412     // Remove this anti dependency and add one in the reverse direction.
1413     SUnit *SU = I->first;
1414     SDep &D = I->second;
1415     SUnit *TargetSU = D.getSUnit();
1416     unsigned Reg = D.getReg();
1417     unsigned Lat = D.getLatency();
1418     SU->removePred(D);
1419     SDep Dep(SU, SDep::Anti, Reg);
1420     Dep.setLatency(Lat);
1421     TargetSU->addPred(Dep);
1422   }
1423 }
1424 
1425 /// Create the adjacency structure of the nodes in the graph.
1426 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1427     SwingSchedulerDAG *DAG) {
1428   BitVector Added(SUnits.size());
1429   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1430     Added.reset();
1431     // Add any successor to the adjacency matrix and exclude duplicates.
1432     for (auto &SI : SUnits[i].Succs) {
1433       // Do not process a boundary node and a back-edge is processed only
1434       // if it goes to a Phi.
1435       if (SI.getSUnit()->isBoundaryNode() ||
1436           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1437         continue;
1438       int N = SI.getSUnit()->NodeNum;
1439       if (!Added.test(N)) {
1440         AdjK[i].push_back(N);
1441         Added.set(N);
1442       }
1443     }
1444     // A chain edge between a store and a load is treated as a back-edge in the
1445     // adjacency matrix.
1446     for (auto &PI : SUnits[i].Preds) {
1447       if (!SUnits[i].getInstr()->mayStore() ||
1448           !DAG->isLoopCarriedOrder(&SUnits[i], PI, false))
1449         continue;
1450       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1451         int N = PI.getSUnit()->NodeNum;
1452         if (!Added.test(N)) {
1453           AdjK[i].push_back(N);
1454           Added.set(N);
1455         }
1456       }
1457     }
1458   }
1459 }
1460 
1461 /// Identify an elementary circuit in the dependence graph starting at the
1462 /// specified node.
1463 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1464                                           bool HasBackedge) {
1465   SUnit *SV = &SUnits[V];
1466   bool F = false;
1467   Stack.insert(SV);
1468   Blocked.set(V);
1469 
1470   for (auto W : AdjK[V]) {
1471     if (NumPaths > MaxPaths)
1472       break;
1473     if (W < S)
1474       continue;
1475     if (W == S) {
1476       if (!HasBackedge)
1477         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1478       F = true;
1479       ++NumPaths;
1480       break;
1481     } else if (!Blocked.test(W)) {
1482       if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
1483         F = true;
1484     }
1485   }
1486 
1487   if (F)
1488     unblock(V);
1489   else {
1490     for (auto W : AdjK[V]) {
1491       if (W < S)
1492         continue;
1493       if (B[W].count(SV) == 0)
1494         B[W].insert(SV);
1495     }
1496   }
1497   Stack.pop_back();
1498   return F;
1499 }
1500 
1501 /// Unblock a node in the circuit finding algorithm.
1502 void SwingSchedulerDAG::Circuits::unblock(int U) {
1503   Blocked.reset(U);
1504   SmallPtrSet<SUnit *, 4> &BU = B[U];
1505   while (!BU.empty()) {
1506     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1507     assert(SI != BU.end() && "Invalid B set.");
1508     SUnit *W = *SI;
1509     BU.erase(W);
1510     if (Blocked.test(W->NodeNum))
1511       unblock(W->NodeNum);
1512   }
1513 }
1514 
1515 /// Identify all the elementary circuits in the dependence graph using
1516 /// Johnson's circuit algorithm.
1517 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1518   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1519   // but we do this to find the circuits, and then change them back.
1520   swapAntiDependences(SUnits);
1521 
1522   Circuits Cir(SUnits);
1523   // Create the adjacency structure.
1524   Cir.createAdjacencyStructure(this);
1525   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1526     Cir.reset();
1527     Cir.circuit(i, i, NodeSets);
1528   }
1529 
1530   // Change the dependences back so that we've created a DAG again.
1531   swapAntiDependences(SUnits);
1532 }
1533 
1534 /// Return true for DAG nodes that we ignore when computing the cost functions.
1535 /// We ignore the back-edge recurrence in order to avoid unbounded recurison
1536 /// in the calculation of the ASAP, ALAP, etc functions.
1537 static bool ignoreDependence(const SDep &D, bool isPred) {
1538   if (D.isArtificial())
1539     return true;
1540   return D.getKind() == SDep::Anti && isPred;
1541 }
1542 
1543 /// Compute several functions need to order the nodes for scheduling.
1544 ///  ASAP - Earliest time to schedule a node.
1545 ///  ALAP - Latest time to schedule a node.
1546 ///  MOV - Mobility function, difference between ALAP and ASAP.
1547 ///  D - Depth of each node.
1548 ///  H - Height of each node.
1549 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1550 
1551   ScheduleInfo.resize(SUnits.size());
1552 
1553   DEBUG({
1554     for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1555                                                     E = Topo.end();
1556          I != E; ++I) {
1557       SUnit *SU = &SUnits[*I];
1558       SU->dump(this);
1559     }
1560   });
1561 
1562   int maxASAP = 0;
1563   // Compute ASAP.
1564   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1565                                                   E = Topo.end();
1566        I != E; ++I) {
1567     int asap = 0;
1568     SUnit *SU = &SUnits[*I];
1569     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1570                                     EP = SU->Preds.end();
1571          IP != EP; ++IP) {
1572       if (ignoreDependence(*IP, true))
1573         continue;
1574       SUnit *pred = IP->getSUnit();
1575       asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
1576                                   getDistance(pred, SU, *IP) * MII));
1577     }
1578     maxASAP = std::max(maxASAP, asap);
1579     ScheduleInfo[*I].ASAP = asap;
1580   }
1581 
1582   // Compute ALAP and MOV.
1583   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1584                                                           E = Topo.rend();
1585        I != E; ++I) {
1586     int alap = maxASAP;
1587     SUnit *SU = &SUnits[*I];
1588     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1589                                     ES = SU->Succs.end();
1590          IS != ES; ++IS) {
1591       if (ignoreDependence(*IS, true))
1592         continue;
1593       SUnit *succ = IS->getSUnit();
1594       alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
1595                                   getDistance(SU, succ, *IS) * MII));
1596     }
1597 
1598     ScheduleInfo[*I].ALAP = alap;
1599   }
1600 
1601   // After computing the node functions, compute the summary for each node set.
1602   for (NodeSet &I : NodeSets)
1603     I.computeNodeSetInfo(this);
1604 
1605   DEBUG({
1606     for (unsigned i = 0; i < SUnits.size(); i++) {
1607       dbgs() << "\tNode " << i << ":\n";
1608       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1609       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1610       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1611       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1612       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1613     }
1614   });
1615 }
1616 
1617 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1618 /// as the predecessors of the elements of NodeOrder that are not also in
1619 /// NodeOrder.
1620 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1621                    SmallSetVector<SUnit *, 8> &Preds,
1622                    const NodeSet *S = nullptr) {
1623   Preds.clear();
1624   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1625        I != E; ++I) {
1626     for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1627          PI != PE; ++PI) {
1628       if (S && S->count(PI->getSUnit()) == 0)
1629         continue;
1630       if (ignoreDependence(*PI, true))
1631         continue;
1632       if (NodeOrder.count(PI->getSUnit()) == 0)
1633         Preds.insert(PI->getSUnit());
1634     }
1635     // Back-edges are predecessors with an anti-dependence.
1636     for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1637                                     ES = (*I)->Succs.end();
1638          IS != ES; ++IS) {
1639       if (IS->getKind() != SDep::Anti)
1640         continue;
1641       if (S && S->count(IS->getSUnit()) == 0)
1642         continue;
1643       if (NodeOrder.count(IS->getSUnit()) == 0)
1644         Preds.insert(IS->getSUnit());
1645     }
1646   }
1647   return Preds.size() > 0;
1648 }
1649 
1650 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1651 /// as the successors of the elements of NodeOrder that are not also in
1652 /// NodeOrder.
1653 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1654                    SmallSetVector<SUnit *, 8> &Succs,
1655                    const NodeSet *S = nullptr) {
1656   Succs.clear();
1657   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1658        I != E; ++I) {
1659     for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1660          SI != SE; ++SI) {
1661       if (S && S->count(SI->getSUnit()) == 0)
1662         continue;
1663       if (ignoreDependence(*SI, false))
1664         continue;
1665       if (NodeOrder.count(SI->getSUnit()) == 0)
1666         Succs.insert(SI->getSUnit());
1667     }
1668     for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1669                                     PE = (*I)->Preds.end();
1670          PI != PE; ++PI) {
1671       if (PI->getKind() != SDep::Anti)
1672         continue;
1673       if (S && S->count(PI->getSUnit()) == 0)
1674         continue;
1675       if (NodeOrder.count(PI->getSUnit()) == 0)
1676         Succs.insert(PI->getSUnit());
1677     }
1678   }
1679   return Succs.size() > 0;
1680 }
1681 
1682 /// Return true if there is a path from the specified node to any of the nodes
1683 /// in DestNodes. Keep track and return the nodes in any path.
1684 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1685                         SetVector<SUnit *> &DestNodes,
1686                         SetVector<SUnit *> &Exclude,
1687                         SmallPtrSet<SUnit *, 8> &Visited) {
1688   if (Cur->isBoundaryNode())
1689     return false;
1690   if (Exclude.count(Cur) != 0)
1691     return false;
1692   if (DestNodes.count(Cur) != 0)
1693     return true;
1694   if (!Visited.insert(Cur).second)
1695     return Path.count(Cur) != 0;
1696   bool FoundPath = false;
1697   for (auto &SI : Cur->Succs)
1698     FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1699   for (auto &PI : Cur->Preds)
1700     if (PI.getKind() == SDep::Anti)
1701       FoundPath |=
1702           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1703   if (FoundPath)
1704     Path.insert(Cur);
1705   return FoundPath;
1706 }
1707 
1708 /// Return true if Set1 is a subset of Set2.
1709 template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1710   for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1711     if (Set2.count(*I) == 0)
1712       return false;
1713   return true;
1714 }
1715 
1716 /// Compute the live-out registers for the instructions in a node-set.
1717 /// The live-out registers are those that are defined in the node-set,
1718 /// but not used. Except for use operands of Phis.
1719 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1720                             NodeSet &NS) {
1721   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1722   MachineRegisterInfo &MRI = MF.getRegInfo();
1723   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1724   SmallSet<unsigned, 4> Uses;
1725   for (SUnit *SU : NS) {
1726     const MachineInstr *MI = SU->getInstr();
1727     if (MI->isPHI())
1728       continue;
1729     for (ConstMIOperands MO(*MI); MO.isValid(); ++MO)
1730       if (MO->isReg() && MO->isUse()) {
1731         unsigned Reg = MO->getReg();
1732         if (TargetRegisterInfo::isVirtualRegister(Reg))
1733           Uses.insert(Reg);
1734         else if (MRI.isAllocatable(Reg))
1735           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1736             Uses.insert(*Units);
1737       }
1738   }
1739   for (SUnit *SU : NS)
1740     for (ConstMIOperands MO(*SU->getInstr()); MO.isValid(); ++MO)
1741       if (MO->isReg() && MO->isDef() && !MO->isDead()) {
1742         unsigned Reg = MO->getReg();
1743         if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1744           if (!Uses.count(Reg))
1745             LiveOutRegs.push_back(RegisterMaskPair(Reg, 0));
1746         } else if (MRI.isAllocatable(Reg)) {
1747           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1748             if (!Uses.count(*Units))
1749               LiveOutRegs.push_back(RegisterMaskPair(*Units, 0));
1750         }
1751       }
1752   RPTracker.addLiveRegs(LiveOutRegs);
1753 }
1754 
1755 /// A heuristic to filter nodes in recurrent node-sets if the register
1756 /// pressure of a set is too high.
1757 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1758   for (auto &NS : NodeSets) {
1759     // Skip small node-sets since they won't cause register pressure problems.
1760     if (NS.size() <= 2)
1761       continue;
1762     IntervalPressure RecRegPressure;
1763     RegPressureTracker RecRPTracker(RecRegPressure);
1764     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1765     computeLiveOuts(MF, RecRPTracker, NS);
1766     RecRPTracker.closeBottom();
1767 
1768     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1769     std::sort(SUnits.begin(), SUnits.end(), [](const SUnit *A, const SUnit *B) {
1770       return A->NodeNum > B->NodeNum;
1771     });
1772 
1773     for (auto &SU : SUnits) {
1774       // Since we're computing the register pressure for a subset of the
1775       // instructions in a block, we need to set the tracker for each
1776       // instruction in the node-set. The tracker is set to the instruction
1777       // just after the one we're interested in.
1778       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1779       RecRPTracker.setPos(std::next(CurInstI));
1780 
1781       RegPressureDelta RPDelta;
1782       ArrayRef<PressureChange> CriticalPSets;
1783       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1784                                              CriticalPSets,
1785                                              RecRegPressure.MaxSetPressure);
1786       if (RPDelta.Excess.isValid()) {
1787         DEBUG(dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1788                      << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1789                      << ":" << RPDelta.Excess.getUnitInc());
1790         NS.setExceedPressure(SU);
1791         break;
1792       }
1793       RecRPTracker.recede();
1794     }
1795   }
1796 }
1797 
1798 /// A heuristic to colocate node sets that have the same set of
1799 /// successors.
1800 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1801   unsigned Colocate = 0;
1802   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1803     NodeSet &N1 = NodeSets[i];
1804     SmallSetVector<SUnit *, 8> S1;
1805     if (N1.empty() || !succ_L(N1, S1))
1806       continue;
1807     for (int j = i + 1; j < e; ++j) {
1808       NodeSet &N2 = NodeSets[j];
1809       if (N1.compareRecMII(N2) != 0)
1810         continue;
1811       SmallSetVector<SUnit *, 8> S2;
1812       if (N2.empty() || !succ_L(N2, S2))
1813         continue;
1814       if (isSubset(S1, S2) && S1.size() == S2.size()) {
1815         N1.setColocate(++Colocate);
1816         N2.setColocate(Colocate);
1817         break;
1818       }
1819     }
1820   }
1821 }
1822 
1823 /// Check if the existing node-sets are profitable. If not, then ignore the
1824 /// recurrent node-sets, and attempt to schedule all nodes together. This is
1825 /// a heuristic. If the MII is large and there is a non-recurrent node with
1826 /// a large depth compared to the MII, then it's best to try and schedule
1827 /// all instruction together instead of starting with the recurrent node-sets.
1828 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1829   // Look for loops with a large MII.
1830   if (MII <= 20)
1831     return;
1832   // Check if the node-set contains only a simple add recurrence.
1833   for (auto &NS : NodeSets)
1834     if (NS.size() > 2)
1835       return;
1836   // If the depth of any instruction is significantly larger than the MII, then
1837   // ignore the recurrent node-sets and treat all instructions equally.
1838   for (auto &SU : SUnits)
1839     if (SU.getDepth() > MII * 1.5) {
1840       NodeSets.clear();
1841       DEBUG(dbgs() << "Clear recurrence node-sets\n");
1842       return;
1843     }
1844 }
1845 
1846 /// Add the nodes that do not belong to a recurrence set into groups
1847 /// based upon connected componenets.
1848 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1849   SetVector<SUnit *> NodesAdded;
1850   SmallPtrSet<SUnit *, 8> Visited;
1851   // Add the nodes that are on a path between the previous node sets and
1852   // the current node set.
1853   for (NodeSet &I : NodeSets) {
1854     SmallSetVector<SUnit *, 8> N;
1855     // Add the nodes from the current node set to the previous node set.
1856     if (succ_L(I, N)) {
1857       SetVector<SUnit *> Path;
1858       for (SUnit *NI : N) {
1859         Visited.clear();
1860         computePath(NI, Path, NodesAdded, I, Visited);
1861       }
1862       if (Path.size() > 0)
1863         I.insert(Path.begin(), Path.end());
1864     }
1865     // Add the nodes from the previous node set to the current node set.
1866     N.clear();
1867     if (succ_L(NodesAdded, N)) {
1868       SetVector<SUnit *> Path;
1869       for (SUnit *NI : N) {
1870         Visited.clear();
1871         computePath(NI, Path, I, NodesAdded, Visited);
1872       }
1873       if (Path.size() > 0)
1874         I.insert(Path.begin(), Path.end());
1875     }
1876     NodesAdded.insert(I.begin(), I.end());
1877   }
1878 
1879   // Create a new node set with the connected nodes of any successor of a node
1880   // in a recurrent set.
1881   NodeSet NewSet;
1882   SmallSetVector<SUnit *, 8> N;
1883   if (succ_L(NodesAdded, N))
1884     for (SUnit *I : N)
1885       addConnectedNodes(I, NewSet, NodesAdded);
1886   if (NewSet.size() > 0)
1887     NodeSets.push_back(NewSet);
1888 
1889   // Create a new node set with the connected nodes of any predecessor of a node
1890   // in a recurrent set.
1891   NewSet.clear();
1892   if (pred_L(NodesAdded, N))
1893     for (SUnit *I : N)
1894       addConnectedNodes(I, NewSet, NodesAdded);
1895   if (NewSet.size() > 0)
1896     NodeSets.push_back(NewSet);
1897 
1898   // Create new nodes sets with the connected nodes any any remaining node that
1899   // has no predecessor.
1900   for (unsigned i = 0; i < SUnits.size(); ++i) {
1901     SUnit *SU = &SUnits[i];
1902     if (NodesAdded.count(SU) == 0) {
1903       NewSet.clear();
1904       addConnectedNodes(SU, NewSet, NodesAdded);
1905       if (NewSet.size() > 0)
1906         NodeSets.push_back(NewSet);
1907     }
1908   }
1909 }
1910 
1911 /// Add the node to the set, and add all is its connected nodes to the set.
1912 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1913                                           SetVector<SUnit *> &NodesAdded) {
1914   NewSet.insert(SU);
1915   NodesAdded.insert(SU);
1916   for (auto &SI : SU->Succs) {
1917     SUnit *Successor = SI.getSUnit();
1918     if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1919       addConnectedNodes(Successor, NewSet, NodesAdded);
1920   }
1921   for (auto &PI : SU->Preds) {
1922     SUnit *Predecessor = PI.getSUnit();
1923     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1924       addConnectedNodes(Predecessor, NewSet, NodesAdded);
1925   }
1926 }
1927 
1928 /// Return true if Set1 contains elements in Set2. The elements in common
1929 /// are returned in a different container.
1930 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1931                         SmallSetVector<SUnit *, 8> &Result) {
1932   Result.clear();
1933   for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1934     SUnit *SU = Set1[i];
1935     if (Set2.count(SU) != 0)
1936       Result.insert(SU);
1937   }
1938   return !Result.empty();
1939 }
1940 
1941 /// Merge the recurrence node sets that have the same initial node.
1942 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1943   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1944        ++I) {
1945     NodeSet &NI = *I;
1946     for (NodeSetType::iterator J = I + 1; J != E;) {
1947       NodeSet &NJ = *J;
1948       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1949         if (NJ.compareRecMII(NI) > 0)
1950           NI.setRecMII(NJ.getRecMII());
1951         for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1952              ++NII)
1953           I->insert(*NII);
1954         NodeSets.erase(J);
1955         E = NodeSets.end();
1956       } else {
1957         ++J;
1958       }
1959     }
1960   }
1961 }
1962 
1963 /// Remove nodes that have been scheduled in previous NodeSets.
1964 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1965   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1966        ++I)
1967     for (NodeSetType::iterator J = I + 1; J != E;) {
1968       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1969 
1970       if (J->size() == 0) {
1971         NodeSets.erase(J);
1972         E = NodeSets.end();
1973       } else {
1974         ++J;
1975       }
1976     }
1977 }
1978 
1979 /// Return true if Inst1 defines a value that is used in Inst2.
1980 static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) {
1981   for (auto &SI : Inst1->Succs)
1982     if (SI.getSUnit() == Inst2 && SI.getKind() == SDep::Data)
1983       return true;
1984   return false;
1985 }
1986 
1987 /// Compute an ordered list of the dependence graph nodes, which
1988 /// indicates the order that the nodes will be scheduled.  This is a
1989 /// two-level algorithm. First, a partial order is created, which
1990 /// consists of a list of sets ordered from highest to lowest priority.
1991 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1992   SmallSetVector<SUnit *, 8> R;
1993   NodeOrder.clear();
1994 
1995   for (auto &Nodes : NodeSets) {
1996     DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1997     OrderKind Order;
1998     SmallSetVector<SUnit *, 8> N;
1999     if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
2000       R.insert(N.begin(), N.end());
2001       Order = BottomUp;
2002       DEBUG(dbgs() << "  Bottom up (preds) ");
2003     } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
2004       R.insert(N.begin(), N.end());
2005       Order = TopDown;
2006       DEBUG(dbgs() << "  Top down (succs) ");
2007     } else if (isIntersect(N, Nodes, R)) {
2008       // If some of the successors are in the existing node-set, then use the
2009       // top-down ordering.
2010       Order = TopDown;
2011       DEBUG(dbgs() << "  Top down (intersect) ");
2012     } else if (NodeSets.size() == 1) {
2013       for (auto &N : Nodes)
2014         if (N->Succs.size() == 0)
2015           R.insert(N);
2016       Order = BottomUp;
2017       DEBUG(dbgs() << "  Bottom up (all) ");
2018     } else {
2019       // Find the node with the highest ASAP.
2020       SUnit *maxASAP = nullptr;
2021       for (SUnit *SU : Nodes) {
2022         if (maxASAP == nullptr || getASAP(SU) >= getASAP(maxASAP))
2023           maxASAP = SU;
2024       }
2025       R.insert(maxASAP);
2026       Order = BottomUp;
2027       DEBUG(dbgs() << "  Bottom up (default) ");
2028     }
2029 
2030     while (!R.empty()) {
2031       if (Order == TopDown) {
2032         // Choose the node with the maximum height.  If more than one, choose
2033         // the node with the lowest MOV. If still more than one, check if there
2034         // is a dependence between the instructions.
2035         while (!R.empty()) {
2036           SUnit *maxHeight = nullptr;
2037           for (SUnit *I : R) {
2038             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2039               maxHeight = I;
2040             else if (getHeight(I) == getHeight(maxHeight) &&
2041                      getMOV(I) < getMOV(maxHeight) &&
2042                      !hasDataDependence(maxHeight, I))
2043               maxHeight = I;
2044             else if (hasDataDependence(I, maxHeight))
2045               maxHeight = I;
2046           }
2047           NodeOrder.insert(maxHeight);
2048           DEBUG(dbgs() << maxHeight->NodeNum << " ");
2049           R.remove(maxHeight);
2050           for (const auto &I : maxHeight->Succs) {
2051             if (Nodes.count(I.getSUnit()) == 0)
2052               continue;
2053             if (NodeOrder.count(I.getSUnit()) != 0)
2054               continue;
2055             if (ignoreDependence(I, false))
2056               continue;
2057             R.insert(I.getSUnit());
2058           }
2059           // Back-edges are predecessors with an anti-dependence.
2060           for (const auto &I : maxHeight->Preds) {
2061             if (I.getKind() != SDep::Anti)
2062               continue;
2063             if (Nodes.count(I.getSUnit()) == 0)
2064               continue;
2065             if (NodeOrder.count(I.getSUnit()) != 0)
2066               continue;
2067             R.insert(I.getSUnit());
2068           }
2069         }
2070         Order = BottomUp;
2071         DEBUG(dbgs() << "\n   Switching order to bottom up ");
2072         SmallSetVector<SUnit *, 8> N;
2073         if (pred_L(NodeOrder, N, &Nodes))
2074           R.insert(N.begin(), N.end());
2075       } else {
2076         // Choose the node with the maximum depth.  If more than one, choose
2077         // the node with the lowest MOV. If there is still more than one, check
2078         // for a dependence between the instructions.
2079         while (!R.empty()) {
2080           SUnit *maxDepth = nullptr;
2081           for (SUnit *I : R) {
2082             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2083               maxDepth = I;
2084             else if (getDepth(I) == getDepth(maxDepth) &&
2085                      getMOV(I) < getMOV(maxDepth) &&
2086                      !hasDataDependence(I, maxDepth))
2087               maxDepth = I;
2088             else if (hasDataDependence(maxDepth, I))
2089               maxDepth = I;
2090           }
2091           NodeOrder.insert(maxDepth);
2092           DEBUG(dbgs() << maxDepth->NodeNum << " ");
2093           R.remove(maxDepth);
2094           if (Nodes.isExceedSU(maxDepth)) {
2095             Order = TopDown;
2096             R.clear();
2097             R.insert(Nodes.getNode(0));
2098             break;
2099           }
2100           for (const auto &I : maxDepth->Preds) {
2101             if (Nodes.count(I.getSUnit()) == 0)
2102               continue;
2103             if (NodeOrder.count(I.getSUnit()) != 0)
2104               continue;
2105             if (I.getKind() == SDep::Anti)
2106               continue;
2107             R.insert(I.getSUnit());
2108           }
2109           // Back-edges are predecessors with an anti-dependence.
2110           for (const auto &I : maxDepth->Succs) {
2111             if (I.getKind() != SDep::Anti)
2112               continue;
2113             if (Nodes.count(I.getSUnit()) == 0)
2114               continue;
2115             if (NodeOrder.count(I.getSUnit()) != 0)
2116               continue;
2117             R.insert(I.getSUnit());
2118           }
2119         }
2120         Order = TopDown;
2121         DEBUG(dbgs() << "\n   Switching order to top down ");
2122         SmallSetVector<SUnit *, 8> N;
2123         if (succ_L(NodeOrder, N, &Nodes))
2124           R.insert(N.begin(), N.end());
2125       }
2126     }
2127     DEBUG(dbgs() << "\nDone with Nodeset\n");
2128   }
2129 
2130   DEBUG({
2131     dbgs() << "Node order: ";
2132     for (SUnit *I : NodeOrder)
2133       dbgs() << " " << I->NodeNum << " ";
2134     dbgs() << "\n";
2135   });
2136 }
2137 
2138 /// Process the nodes in the computed order and create the pipelined schedule
2139 /// of the instructions, if possible. Return true if a schedule is found.
2140 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2141 
2142   if (NodeOrder.size() == 0)
2143     return false;
2144 
2145   bool scheduleFound = false;
2146   // Keep increasing II until a valid schedule is found.
2147   for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
2148     Schedule.reset();
2149     Schedule.setInitiationInterval(II);
2150     DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2151 
2152     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2153     SetVector<SUnit *>::iterator NE = NodeOrder.end();
2154     do {
2155       SUnit *SU = *NI;
2156 
2157       // Compute the schedule time for the instruction, which is based
2158       // upon the scheduled time for any predecessors/successors.
2159       int EarlyStart = INT_MIN;
2160       int LateStart = INT_MAX;
2161       // These values are set when the size of the schedule window is limited
2162       // due to chain dependences.
2163       int SchedEnd = INT_MAX;
2164       int SchedStart = INT_MIN;
2165       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2166                             II, this);
2167       DEBUG({
2168         dbgs() << "Inst (" << SU->NodeNum << ") ";
2169         SU->getInstr()->dump();
2170         dbgs() << "\n";
2171       });
2172       DEBUG({
2173         dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
2174                << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
2175       });
2176 
2177       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2178           SchedStart > LateStart)
2179         scheduleFound = false;
2180       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2181         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2182         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2183       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2184         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2185         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2186       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2187         SchedEnd =
2188             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2189         // When scheduling a Phi it is better to start at the late cycle and go
2190         // backwards. The default order may insert the Phi too far away from
2191         // its first dependence.
2192         if (SU->getInstr()->isPHI())
2193           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2194         else
2195           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2196       } else {
2197         int FirstCycle = Schedule.getFirstCycle();
2198         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2199                                         FirstCycle + getASAP(SU) + II - 1, II);
2200       }
2201       // Even if we find a schedule, make sure the schedule doesn't exceed the
2202       // allowable number of stages. We keep trying if this happens.
2203       if (scheduleFound)
2204         if (SwpMaxStages > -1 &&
2205             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2206           scheduleFound = false;
2207 
2208       DEBUG({
2209         if (!scheduleFound)
2210           dbgs() << "\tCan't schedule\n";
2211       });
2212     } while (++NI != NE && scheduleFound);
2213 
2214     // If a schedule is found, check if it is a valid schedule too.
2215     if (scheduleFound)
2216       scheduleFound = Schedule.isValidSchedule(this);
2217   }
2218 
2219   DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
2220 
2221   if (scheduleFound)
2222     Schedule.finalizeSchedule(this);
2223   else
2224     Schedule.reset();
2225 
2226   return scheduleFound && Schedule.getMaxStageCount() > 0;
2227 }
2228 
2229 /// Given a schedule for the loop, generate a new version of the loop,
2230 /// and replace the old version.  This function generates a prolog
2231 /// that contains the initial iterations in the pipeline, and kernel
2232 /// loop, and the epilogue that contains the code for the final
2233 /// iterations.
2234 void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2235   // Create a new basic block for the kernel and add it to the CFG.
2236   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2237 
2238   unsigned MaxStageCount = Schedule.getMaxStageCount();
2239 
2240   // Remember the registers that are used in different stages. The index is
2241   // the iteration, or stage, that the instruction is scheduled in.  This is
2242   // a map between register names in the orignal block and the names created
2243   // in each stage of the pipelined loop.
2244   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2245   InstrMapTy InstrMap;
2246 
2247   SmallVector<MachineBasicBlock *, 4> PrologBBs;
2248   // Generate the prolog instructions that set up the pipeline.
2249   generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2250   MF.insert(BB->getIterator(), KernelBB);
2251 
2252   // Rearrange the instructions to generate the new, pipelined loop,
2253   // and update register names as needed.
2254   for (int Cycle = Schedule.getFirstCycle(),
2255            LastCycle = Schedule.getFinalCycle();
2256        Cycle <= LastCycle; ++Cycle) {
2257     std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
2258     // This inner loop schedules each instruction in the cycle.
2259     for (SUnit *CI : CycleInstrs) {
2260       if (CI->getInstr()->isPHI())
2261         continue;
2262       unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2263       MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2264       updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2265       KernelBB->push_back(NewMI);
2266       InstrMap[NewMI] = CI->getInstr();
2267     }
2268   }
2269 
2270   // Copy any terminator instructions to the new kernel, and update
2271   // names as needed.
2272   for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2273                                    E = BB->instr_end();
2274        I != E; ++I) {
2275     MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2276     updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2277     KernelBB->push_back(NewMI);
2278     InstrMap[NewMI] = &*I;
2279   }
2280 
2281   KernelBB->transferSuccessors(BB);
2282   KernelBB->replaceSuccessor(BB, KernelBB);
2283 
2284   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2285                        VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2286   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2287                InstrMap, MaxStageCount, MaxStageCount, false);
2288 
2289   DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2290 
2291   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
2292   // Generate the epilog instructions to complete the pipeline.
2293   generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2294                  PrologBBs);
2295 
2296   // We need this step because the register allocation doesn't handle some
2297   // situations well, so we insert copies to help out.
2298   splitLifetimes(KernelBB, EpilogBBs, Schedule);
2299 
2300   // Remove dead instructions due to loop induction variables.
2301   removeDeadInstructions(KernelBB, EpilogBBs);
2302 
2303   // Add branches between prolog and epilog blocks.
2304   addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2305 
2306   // Remove the original loop since it's no longer referenced.
2307   BB->clear();
2308   BB->eraseFromParent();
2309 
2310   delete[] VRMap;
2311 }
2312 
2313 /// Generate the pipeline prolog code.
2314 void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2315                                        MachineBasicBlock *KernelBB,
2316                                        ValueMapTy *VRMap,
2317                                        MBBVectorTy &PrologBBs) {
2318   MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2319   assert(PreheaderBB != NULL &&
2320          "Need to add code to handle loops w/o preheader");
2321   MachineBasicBlock *PredBB = PreheaderBB;
2322   InstrMapTy InstrMap;
2323 
2324   // Generate a basic block for each stage, not including the last stage,
2325   // which will be generated in the kernel. Each basic block may contain
2326   // instructions from multiple stages/iterations.
2327   for (unsigned i = 0; i < LastStage; ++i) {
2328     // Create and insert the prolog basic block prior to the original loop
2329     // basic block.  The original loop is removed later.
2330     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2331     PrologBBs.push_back(NewBB);
2332     MF.insert(BB->getIterator(), NewBB);
2333     NewBB->transferSuccessors(PredBB);
2334     PredBB->addSuccessor(NewBB);
2335     PredBB = NewBB;
2336 
2337     // Generate instructions for each appropriate stage. Process instructions
2338     // in original program order.
2339     for (int StageNum = i; StageNum >= 0; --StageNum) {
2340       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2341                                        BBE = BB->getFirstTerminator();
2342            BBI != BBE; ++BBI) {
2343         if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
2344           if (BBI->isPHI())
2345             continue;
2346           MachineInstr *NewMI =
2347               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2348           updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2349                             VRMap);
2350           NewBB->push_back(NewMI);
2351           InstrMap[NewMI] = &*BBI;
2352         }
2353       }
2354     }
2355     rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2356     DEBUG({
2357       dbgs() << "prolog:\n";
2358       NewBB->dump();
2359     });
2360   }
2361 
2362   PredBB->replaceSuccessor(BB, KernelBB);
2363 
2364   // Check if we need to remove the branch from the preheader to the original
2365   // loop, and replace it with a branch to the new loop.
2366   unsigned numBranches = TII->removeBranch(*PreheaderBB);
2367   if (numBranches) {
2368     SmallVector<MachineOperand, 0> Cond;
2369     TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2370   }
2371 }
2372 
2373 /// Generate the pipeline epilog code. The epilog code finishes the iterations
2374 /// that were started in either the prolog or the kernel.  We create a basic
2375 /// block for each stage that needs to complete.
2376 void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2377                                        MachineBasicBlock *KernelBB,
2378                                        ValueMapTy *VRMap,
2379                                        MBBVectorTy &EpilogBBs,
2380                                        MBBVectorTy &PrologBBs) {
2381   // We need to change the branch from the kernel to the first epilog block, so
2382   // this call to analyze branch uses the kernel rather than the original BB.
2383   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2384   SmallVector<MachineOperand, 4> Cond;
2385   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2386   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2387   if (checkBranch)
2388     return;
2389 
2390   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2391   if (*LoopExitI == KernelBB)
2392     ++LoopExitI;
2393   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2394   MachineBasicBlock *LoopExitBB = *LoopExitI;
2395 
2396   MachineBasicBlock *PredBB = KernelBB;
2397   MachineBasicBlock *EpilogStart = LoopExitBB;
2398   InstrMapTy InstrMap;
2399 
2400   // Generate a basic block for each stage, not including the last stage,
2401   // which was generated for the kernel.  Each basic block may contain
2402   // instructions from multiple stages/iterations.
2403   int EpilogStage = LastStage + 1;
2404   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2405     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2406     EpilogBBs.push_back(NewBB);
2407     MF.insert(BB->getIterator(), NewBB);
2408 
2409     PredBB->replaceSuccessor(LoopExitBB, NewBB);
2410     NewBB->addSuccessor(LoopExitBB);
2411 
2412     if (EpilogStart == LoopExitBB)
2413       EpilogStart = NewBB;
2414 
2415     // Add instructions to the epilog depending on the current block.
2416     // Process instructions in original program order.
2417     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2418       for (auto &BBI : *BB) {
2419         if (BBI.isPHI())
2420           continue;
2421         MachineInstr *In = &BBI;
2422         if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2423           MachineInstr *NewMI = cloneInstr(In, EpilogStage - LastStage, 0);
2424           updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2425           NewBB->push_back(NewMI);
2426           InstrMap[NewMI] = In;
2427         }
2428       }
2429     }
2430     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2431                          VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2432     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2433                  InstrMap, LastStage, EpilogStage, i == 1);
2434     PredBB = NewBB;
2435 
2436     DEBUG({
2437       dbgs() << "epilog:\n";
2438       NewBB->dump();
2439     });
2440   }
2441 
2442   // Fix any Phi nodes in the loop exit block.
2443   for (MachineInstr &MI : *LoopExitBB) {
2444     if (!MI.isPHI())
2445       break;
2446     for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2447       MachineOperand &MO = MI.getOperand(i);
2448       if (MO.getMBB() == BB)
2449         MO.setMBB(PredBB);
2450     }
2451   }
2452 
2453   // Create a branch to the new epilog from the kernel.
2454   // Remove the original branch and add a new branch to the epilog.
2455   TII->removeBranch(*KernelBB);
2456   TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2457   // Add a branch to the loop exit.
2458   if (EpilogBBs.size() > 0) {
2459     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2460     SmallVector<MachineOperand, 4> Cond1;
2461     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2462   }
2463 }
2464 
2465 /// Replace all uses of FromReg that appear outside the specified
2466 /// basic block with ToReg.
2467 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2468                                     MachineBasicBlock *MBB,
2469                                     MachineRegisterInfo &MRI,
2470                                     LiveIntervals &LIS) {
2471   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2472                                          E = MRI.use_end();
2473        I != E;) {
2474     MachineOperand &O = *I;
2475     ++I;
2476     if (O.getParent()->getParent() != MBB)
2477       O.setReg(ToReg);
2478   }
2479   if (!LIS.hasInterval(ToReg))
2480     LIS.createEmptyInterval(ToReg);
2481 }
2482 
2483 /// Return true if the register has a use that occurs outside the
2484 /// specified loop.
2485 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2486                             MachineRegisterInfo &MRI) {
2487   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2488                                          E = MRI.use_end();
2489        I != E; ++I)
2490     if (I->getParent()->getParent() != BB)
2491       return true;
2492   return false;
2493 }
2494 
2495 /// Generate Phis for the specific block in the generated pipelined code.
2496 /// This function looks at the Phis from the original code to guide the
2497 /// creation of new Phis.
2498 void SwingSchedulerDAG::generateExistingPhis(
2499     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2500     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2501     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2502     bool IsLast) {
2503   // Compute the stage number for the inital value of the Phi, which
2504   // comes from the prolog. The prolog to use depends on to which kernel/
2505   // epilog that we're adding the Phi.
2506   unsigned PrologStage = 0;
2507   unsigned PrevStage = 0;
2508   bool InKernel = (LastStageNum == CurStageNum);
2509   if (InKernel) {
2510     PrologStage = LastStageNum - 1;
2511     PrevStage = CurStageNum;
2512   } else {
2513     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2514     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2515   }
2516 
2517   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2518                                    BBE = BB->getFirstNonPHI();
2519        BBI != BBE; ++BBI) {
2520     unsigned Def = BBI->getOperand(0).getReg();
2521 
2522     unsigned InitVal = 0;
2523     unsigned LoopVal = 0;
2524     getPhiRegs(*BBI, BB, InitVal, LoopVal);
2525 
2526     unsigned PhiOp1 = 0;
2527     // The Phi value from the loop body typically is defined in the loop, but
2528     // not always. So, we need to check if the value is defined in the loop.
2529     unsigned PhiOp2 = LoopVal;
2530     if (VRMap[LastStageNum].count(LoopVal))
2531       PhiOp2 = VRMap[LastStageNum][LoopVal];
2532 
2533     int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2534     int LoopValStage =
2535         Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2536     unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2537     if (NumStages == 0) {
2538       // We don't need to generate a Phi anymore, but we need to rename any uses
2539       // of the Phi value.
2540       unsigned NewReg = VRMap[PrevStage][LoopVal];
2541       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2542                             Def, NewReg);
2543       if (VRMap[CurStageNum].count(LoopVal))
2544         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2545     }
2546     // Adjust the number of Phis needed depending on the number of prologs left,
2547     // and the distance from where the Phi is first scheduled.
2548     unsigned NumPhis = NumStages;
2549     if (!InKernel && (int)PrologStage < LoopValStage)
2550       // The NumPhis is the maximum number of new Phis needed during the steady
2551       // state. If the Phi has not been scheduled in current prolog, then we
2552       // need to generate less Phis.
2553       NumPhis = std::max((int)NumPhis - (int)(LoopValStage - PrologStage), 1);
2554     // The number of Phis cannot exceed the number of prolog stages. Each
2555     // stage can potentially define two values.
2556     NumPhis = std::min(NumPhis, PrologStage + 2);
2557 
2558     unsigned NewReg = 0;
2559 
2560     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2561     // In the epilog, we may need to look back one stage to get the correct
2562     // Phi name because the epilog and prolog blocks execute the same stage.
2563     // The correct name is from the previous block only when the Phi has
2564     // been completely scheduled prior to the epilog, and Phi value is not
2565     // needed in multiple stages.
2566     int StageDiff = 0;
2567     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2568         NumPhis == 1)
2569       StageDiff = 1;
2570     // Adjust the computations below when the phi and the loop definition
2571     // are scheduled in different stages.
2572     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2573       StageDiff = StageScheduled - LoopValStage;
2574     for (unsigned np = 0; np < NumPhis; ++np) {
2575       // If the Phi hasn't been scheduled, then use the initial Phi operand
2576       // value. Otherwise, use the scheduled version of the instruction. This
2577       // is a little complicated when a Phi references another Phi.
2578       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2579         PhiOp1 = InitVal;
2580       // Check if the Phi has already been scheduled in a prolog stage.
2581       else if (PrologStage >= AccessStage + StageDiff + np &&
2582                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2583         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2584       // Check if the Phi has already been scheduled, but the loop intruction
2585       // is either another Phi, or doesn't occur in the loop.
2586       else if (PrologStage >= AccessStage + StageDiff + np) {
2587         // If the Phi references another Phi, we need to examine the other
2588         // Phi to get the correct value.
2589         PhiOp1 = LoopVal;
2590         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2591         int Indirects = 1;
2592         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2593           int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2594           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2595             PhiOp1 = getInitPhiReg(*InstOp1, BB);
2596           else
2597             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2598           InstOp1 = MRI.getVRegDef(PhiOp1);
2599           int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2600           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2601           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2602               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2603             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2604             break;
2605           }
2606           ++Indirects;
2607         }
2608       } else
2609         PhiOp1 = InitVal;
2610       // If this references a generated Phi in the kernel, get the Phi operand
2611       // from the incoming block.
2612       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2613         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2614           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2615 
2616       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2617       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2618       // In the epilog, a map lookup is needed to get the value from the kernel,
2619       // or previous epilog block. How is does this depends on if the
2620       // instruction is scheduled in the previous block.
2621       if (!InKernel) {
2622         int StageDiffAdj = 0;
2623         if (LoopValStage != -1 && StageScheduled > LoopValStage)
2624           StageDiffAdj = StageScheduled - LoopValStage;
2625         // Use the loop value defined in the kernel, unless the kernel
2626         // contains the last definition of the Phi.
2627         if (np == 0 && PrevStage == LastStageNum &&
2628             (StageScheduled != 0 || LoopValStage != 0) &&
2629             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2630           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2631         // Use the value defined by the Phi. We add one because we switch
2632         // from looking at the loop value to the Phi definition.
2633         else if (np > 0 && PrevStage == LastStageNum &&
2634                  VRMap[PrevStage - np + 1].count(Def))
2635           PhiOp2 = VRMap[PrevStage - np + 1][Def];
2636         // Use the loop value defined in the kernel.
2637         else if ((unsigned)LoopValStage + StageDiffAdj > PrologStage + 1 &&
2638                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2639           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2640         // Use the value defined by the Phi, unless we're generating the first
2641         // epilog and the Phi refers to a Phi in a different stage.
2642         else if (VRMap[PrevStage - np].count(Def) &&
2643                  (!LoopDefIsPhi || PrevStage != LastStageNum))
2644           PhiOp2 = VRMap[PrevStage - np][Def];
2645       }
2646 
2647       // Check if we can reuse an existing Phi. This occurs when a Phi
2648       // references another Phi, and the other Phi is scheduled in an
2649       // earlier stage. We can try to reuse an existing Phi up until the last
2650       // stage of the current Phi.
2651       if (LoopDefIsPhi && (int)PrologStage >= StageScheduled) {
2652         int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2653         int StageDiff = (StageScheduled - LoopValStage);
2654         LVNumStages -= StageDiff;
2655         if (LVNumStages > (int)np) {
2656           NewReg = PhiOp2;
2657           unsigned ReuseStage = CurStageNum;
2658           if (Schedule.isLoopCarried(this, *PhiInst))
2659             ReuseStage -= LVNumStages;
2660           // Check if the Phi to reuse has been generated yet. If not, then
2661           // there is nothing to reuse.
2662           if (VRMap[ReuseStage].count(LoopVal)) {
2663             NewReg = VRMap[ReuseStage][LoopVal];
2664 
2665             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2666                                   &*BBI, Def, NewReg);
2667             // Update the map with the new Phi name.
2668             VRMap[CurStageNum - np][Def] = NewReg;
2669             PhiOp2 = NewReg;
2670             if (VRMap[LastStageNum - np - 1].count(LoopVal))
2671               PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2672 
2673             if (IsLast && np == NumPhis - 1)
2674               replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2675             continue;
2676           }
2677         } else if (StageDiff > 0 &&
2678                    VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2679           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2680       }
2681 
2682       const TargetRegisterClass *RC = MRI.getRegClass(Def);
2683       NewReg = MRI.createVirtualRegister(RC);
2684 
2685       MachineInstrBuilder NewPhi =
2686           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2687                   TII->get(TargetOpcode::PHI), NewReg);
2688       NewPhi.addReg(PhiOp1).addMBB(BB1);
2689       NewPhi.addReg(PhiOp2).addMBB(BB2);
2690       if (np == 0)
2691         InstrMap[NewPhi] = &*BBI;
2692 
2693       // We define the Phis after creating the new pipelined code, so
2694       // we need to rename the Phi values in scheduled instructions.
2695 
2696       unsigned PrevReg = 0;
2697       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2698         PrevReg = VRMap[PrevStage - np][LoopVal];
2699       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2700                             Def, NewReg, PrevReg);
2701       // If the Phi has been scheduled, use the new name for rewriting.
2702       if (VRMap[CurStageNum - np].count(Def)) {
2703         unsigned R = VRMap[CurStageNum - np][Def];
2704         rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2705                               R, NewReg);
2706       }
2707 
2708       // Check if we need to rename any uses that occurs after the loop. The
2709       // register to replace depends on whether the Phi is scheduled in the
2710       // epilog.
2711       if (IsLast && np == NumPhis - 1)
2712         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2713 
2714       // In the kernel, a dependent Phi uses the value from this Phi.
2715       if (InKernel)
2716         PhiOp2 = NewReg;
2717 
2718       // Update the map with the new Phi name.
2719       VRMap[CurStageNum - np][Def] = NewReg;
2720     }
2721 
2722     while (NumPhis++ < NumStages) {
2723       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2724                             &*BBI, Def, NewReg, 0);
2725     }
2726 
2727     // Check if we need to rename a Phi that has been eliminated due to
2728     // scheduling.
2729     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2730       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2731   }
2732 }
2733 
2734 /// Generate Phis for the specified block in the generated pipelined code.
2735 /// These are new Phis needed because the definition is scheduled after the
2736 /// use in the pipelened sequence.
2737 void SwingSchedulerDAG::generatePhis(
2738     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2739     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2740     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2741     bool IsLast) {
2742   // Compute the stage number that contains the initial Phi value, and
2743   // the Phi from the previous stage.
2744   unsigned PrologStage = 0;
2745   unsigned PrevStage = 0;
2746   unsigned StageDiff = CurStageNum - LastStageNum;
2747   bool InKernel = (StageDiff == 0);
2748   if (InKernel) {
2749     PrologStage = LastStageNum - 1;
2750     PrevStage = CurStageNum;
2751   } else {
2752     PrologStage = LastStageNum - StageDiff;
2753     PrevStage = LastStageNum + StageDiff - 1;
2754   }
2755 
2756   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2757                                    BBE = BB->instr_end();
2758        BBI != BBE; ++BBI) {
2759     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2760       MachineOperand &MO = BBI->getOperand(i);
2761       if (!MO.isReg() || !MO.isDef() ||
2762           !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2763         continue;
2764 
2765       int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2766       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2767       unsigned Def = MO.getReg();
2768       unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2769       // An instruction scheduled in stage 0 and is used after the loop
2770       // requires a phi in the epilog for the last definition from either
2771       // the kernel or prolog.
2772       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2773           hasUseAfterLoop(Def, BB, MRI))
2774         NumPhis = 1;
2775       if (!InKernel && (unsigned)StageScheduled > PrologStage)
2776         continue;
2777 
2778       unsigned PhiOp2 = VRMap[PrevStage][Def];
2779       if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2780         if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2781           PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2782       // The number of Phis can't exceed the number of prolog stages. The
2783       // prolog stage number is zero based.
2784       if (NumPhis > PrologStage + 1 - StageScheduled)
2785         NumPhis = PrologStage + 1 - StageScheduled;
2786       for (unsigned np = 0; np < NumPhis; ++np) {
2787         unsigned PhiOp1 = VRMap[PrologStage][Def];
2788         if (np <= PrologStage)
2789           PhiOp1 = VRMap[PrologStage - np][Def];
2790         if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2791           if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2792             PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2793           if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2794             PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2795         }
2796         if (!InKernel)
2797           PhiOp2 = VRMap[PrevStage - np][Def];
2798 
2799         const TargetRegisterClass *RC = MRI.getRegClass(Def);
2800         unsigned NewReg = MRI.createVirtualRegister(RC);
2801 
2802         MachineInstrBuilder NewPhi =
2803             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2804                     TII->get(TargetOpcode::PHI), NewReg);
2805         NewPhi.addReg(PhiOp1).addMBB(BB1);
2806         NewPhi.addReg(PhiOp2).addMBB(BB2);
2807         if (np == 0)
2808           InstrMap[NewPhi] = &*BBI;
2809 
2810         // Rewrite uses and update the map. The actions depend upon whether
2811         // we generating code for the kernel or epilog blocks.
2812         if (InKernel) {
2813           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2814                                 &*BBI, PhiOp1, NewReg);
2815           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2816                                 &*BBI, PhiOp2, NewReg);
2817 
2818           PhiOp2 = NewReg;
2819           VRMap[PrevStage - np - 1][Def] = NewReg;
2820         } else {
2821           VRMap[CurStageNum - np][Def] = NewReg;
2822           if (np == NumPhis - 1)
2823             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2824                                   &*BBI, Def, NewReg);
2825         }
2826         if (IsLast && np == NumPhis - 1)
2827           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2828       }
2829     }
2830   }
2831 }
2832 
2833 /// Remove instructions that generate values with no uses.
2834 /// Typically, these are induction variable operations that generate values
2835 /// used in the loop itself.  A dead instruction has a definition with
2836 /// no uses, or uses that occur in the original loop only.
2837 void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2838                                                MBBVectorTy &EpilogBBs) {
2839   // For each epilog block, check that the value defined by each instruction
2840   // is used.  If not, delete it.
2841   for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2842                                      MBE = EpilogBBs.rend();
2843        MBB != MBE; ++MBB)
2844     for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2845                                                    ME = (*MBB)->instr_rend();
2846          MI != ME;) {
2847       // From DeadMachineInstructionElem. Don't delete inline assembly.
2848       if (MI->isInlineAsm()) {
2849         ++MI;
2850         continue;
2851       }
2852       bool SawStore = false;
2853       // Check if it's safe to remove the instruction due to side effects.
2854       // We can, and want to, remove Phis here.
2855       if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
2856         ++MI;
2857         continue;
2858       }
2859       bool used = true;
2860       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2861                                       MOE = MI->operands_end();
2862            MOI != MOE; ++MOI) {
2863         if (!MOI->isReg() || !MOI->isDef())
2864           continue;
2865         unsigned reg = MOI->getReg();
2866         unsigned realUses = 0;
2867         for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2868                                                EI = MRI.use_end();
2869              UI != EI; ++UI) {
2870           // Check if there are any uses that occur only in the original
2871           // loop.  If so, that's not a real use.
2872           if (UI->getParent()->getParent() != BB) {
2873             realUses++;
2874             used = true;
2875             break;
2876           }
2877         }
2878         if (realUses > 0)
2879           break;
2880         used = false;
2881       }
2882       if (!used) {
2883         MI++->eraseFromParent();
2884         continue;
2885       }
2886       ++MI;
2887     }
2888   // In the kernel block, check if we can remove a Phi that generates a value
2889   // used in an instruction removed in the epilog block.
2890   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2891                                    BBE = KernelBB->getFirstNonPHI();
2892        BBI != BBE;) {
2893     MachineInstr *MI = &*BBI;
2894     ++BBI;
2895     unsigned reg = MI->getOperand(0).getReg();
2896     if (MRI.use_begin(reg) == MRI.use_end()) {
2897       MI->eraseFromParent();
2898     }
2899   }
2900 }
2901 
2902 /// For loop carried definitions, we split the lifetime of a virtual register
2903 /// that has uses past the definition in the next iteration. A copy with a new
2904 /// virtual register is inserted before the definition, which helps with
2905 /// generating a better register assignment.
2906 ///
2907 ///   v1 = phi(a, v2)     v1 = phi(a, v2)
2908 ///   v2 = phi(b, v3)     v2 = phi(b, v3)
2909 ///   v3 = ..             v4 = copy v1
2910 ///   .. = V1             v3 = ..
2911 ///                       .. = v4
2912 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
2913                                        MBBVectorTy &EpilogBBs,
2914                                        SMSchedule &Schedule) {
2915   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2916   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2917                                    BBF = KernelBB->getFirstNonPHI();
2918        BBI != BBF; ++BBI) {
2919     unsigned Def = BBI->getOperand(0).getReg();
2920     // Check for any Phi definition that used as an operand of another Phi
2921     // in the same block.
2922     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
2923                                                  E = MRI.use_instr_end();
2924          I != E; ++I) {
2925       if (I->isPHI() && I->getParent() == KernelBB) {
2926         // Get the loop carried definition.
2927         unsigned LCDef = getLoopPhiReg(*BBI, KernelBB);
2928         if (!LCDef)
2929           continue;
2930         MachineInstr *MI = MRI.getVRegDef(LCDef);
2931         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
2932           continue;
2933         // Search through the rest of the block looking for uses of the Phi
2934         // definition. If one occurs, then split the lifetime.
2935         unsigned SplitReg = 0;
2936         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
2937                                     KernelBB->instr_end()))
2938           if (BBJ.readsRegister(Def)) {
2939             // We split the lifetime when we find the first use.
2940             if (SplitReg == 0) {
2941               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
2942               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
2943                       TII->get(TargetOpcode::COPY), SplitReg)
2944                   .addReg(Def);
2945             }
2946             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
2947           }
2948         if (!SplitReg)
2949           continue;
2950         // Search through each of the epilog blocks for any uses to be renamed.
2951         for (auto &Epilog : EpilogBBs)
2952           for (auto &I : *Epilog)
2953             if (I.readsRegister(Def))
2954               I.substituteRegister(Def, SplitReg, 0, *TRI);
2955         break;
2956       }
2957     }
2958   }
2959 }
2960 
2961 /// Remove the incoming block from the Phis in a basic block.
2962 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
2963   for (MachineInstr &MI : *BB) {
2964     if (!MI.isPHI())
2965       break;
2966     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
2967       if (MI.getOperand(i + 1).getMBB() == Incoming) {
2968         MI.RemoveOperand(i + 1);
2969         MI.RemoveOperand(i);
2970         break;
2971       }
2972   }
2973 }
2974 
2975 /// Create branches from each prolog basic block to the appropriate epilog
2976 /// block.  These edges are needed if the loop ends before reaching the
2977 /// kernel.
2978 void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2979                                     MachineBasicBlock *KernelBB,
2980                                     MBBVectorTy &EpilogBBs,
2981                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
2982   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
2983   MachineInstr *IndVar = Pass.LI.LoopInductionVar;
2984   MachineInstr *Cmp = Pass.LI.LoopCompare;
2985   MachineBasicBlock *LastPro = KernelBB;
2986   MachineBasicBlock *LastEpi = KernelBB;
2987 
2988   // Start from the blocks connected to the kernel and work "out"
2989   // to the first prolog and the last epilog blocks.
2990   SmallVector<MachineInstr *, 4> PrevInsts;
2991   unsigned MaxIter = PrologBBs.size() - 1;
2992   unsigned LC = UINT_MAX;
2993   unsigned LCMin = UINT_MAX;
2994   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
2995     // Add branches to the prolog that go to the corresponding
2996     // epilog, and the fall-thru prolog/kernel block.
2997     MachineBasicBlock *Prolog = PrologBBs[j];
2998     MachineBasicBlock *Epilog = EpilogBBs[i];
2999     // We've executed one iteration, so decrement the loop count and check for
3000     // the loop end.
3001     SmallVector<MachineOperand, 4> Cond;
3002     // Check if the LOOP0 has already been removed. If so, then there is no need
3003     // to reduce the trip count.
3004     if (LC != 0)
3005       LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
3006                                 MaxIter);
3007 
3008     // Record the value of the first trip count, which is used to determine if
3009     // branches and blocks can be removed for constant trip counts.
3010     if (LCMin == UINT_MAX)
3011       LCMin = LC;
3012 
3013     unsigned numAdded = 0;
3014     if (TargetRegisterInfo::isVirtualRegister(LC)) {
3015       Prolog->addSuccessor(Epilog);
3016       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
3017     } else if (j >= LCMin) {
3018       Prolog->addSuccessor(Epilog);
3019       Prolog->removeSuccessor(LastPro);
3020       LastEpi->removeSuccessor(Epilog);
3021       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
3022       removePhis(Epilog, LastEpi);
3023       // Remove the blocks that are no longer referenced.
3024       if (LastPro != LastEpi) {
3025         LastEpi->clear();
3026         LastEpi->eraseFromParent();
3027       }
3028       LastPro->clear();
3029       LastPro->eraseFromParent();
3030     } else {
3031       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
3032       removePhis(Epilog, Prolog);
3033     }
3034     LastPro = Prolog;
3035     LastEpi = Epilog;
3036     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
3037                                                    E = Prolog->instr_rend();
3038          I != E && numAdded > 0; ++I, --numAdded)
3039       updateInstruction(&*I, false, j, 0, Schedule, VRMap);
3040   }
3041 }
3042 
3043 /// Return true if we can compute the amount the instruction changes
3044 /// during each iteration. Set Delta to the amount of the change.
3045 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
3046   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3047   unsigned BaseReg;
3048   int64_t Offset;
3049   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
3050     return false;
3051 
3052   MachineRegisterInfo &MRI = MF.getRegInfo();
3053   // Check if there is a Phi. If so, get the definition in the loop.
3054   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
3055   if (BaseDef && BaseDef->isPHI()) {
3056     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
3057     BaseDef = MRI.getVRegDef(BaseReg);
3058   }
3059   if (!BaseDef)
3060     return false;
3061 
3062   int D = 0;
3063   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
3064     return false;
3065 
3066   Delta = D;
3067   return true;
3068 }
3069 
3070 /// Update the memory operand with a new offset when the pipeliner
3071 /// generates a new copy of the instruction that refers to a
3072 /// different memory location.
3073 void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
3074                                           MachineInstr &OldMI, unsigned Num) {
3075   if (Num == 0)
3076     return;
3077   // If the instruction has memory operands, then adjust the offset
3078   // when the instruction appears in different stages.
3079   unsigned NumRefs = NewMI.memoperands_end() - NewMI.memoperands_begin();
3080   if (NumRefs == 0)
3081     return;
3082   MachineInstr::mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NumRefs);
3083   unsigned Refs = 0;
3084   for (MachineMemOperand *MMO : NewMI.memoperands()) {
3085     if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
3086         (!MMO->getValue())) {
3087       NewMemRefs[Refs++] = MMO;
3088       continue;
3089     }
3090     unsigned Delta;
3091     if (computeDelta(OldMI, Delta)) {
3092       int64_t AdjOffset = Delta * Num;
3093       NewMemRefs[Refs++] =
3094           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize());
3095     } else
3096       NewMemRefs[Refs++] = MF.getMachineMemOperand(MMO, 0, UINT64_MAX);
3097   }
3098   NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs);
3099 }
3100 
3101 /// Clone the instruction for the new pipelined loop and update the
3102 /// memory operands, if needed.
3103 MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
3104                                             unsigned CurStageNum,
3105                                             unsigned InstStageNum) {
3106   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3107   // Check for tied operands in inline asm instructions. This should be handled
3108   // elsewhere, but I'm not sure of the best solution.
3109   if (OldMI->isInlineAsm())
3110     for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
3111       const auto &MO = OldMI->getOperand(i);
3112       if (MO.isReg() && MO.isUse())
3113         break;
3114       unsigned UseIdx;
3115       if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
3116         NewMI->tieOperands(i, UseIdx);
3117     }
3118   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3119   return NewMI;
3120 }
3121 
3122 /// Clone the instruction for the new pipelined loop. If needed, this
3123 /// function updates the instruction using the values saved in the
3124 /// InstrChanges structure.
3125 MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
3126                                                      unsigned CurStageNum,
3127                                                      unsigned InstStageNum,
3128                                                      SMSchedule &Schedule) {
3129   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3130   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3131       InstrChanges.find(getSUnit(OldMI));
3132   if (It != InstrChanges.end()) {
3133     std::pair<unsigned, int64_t> RegAndOffset = It->second;
3134     unsigned BasePos, OffsetPos;
3135     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
3136       return nullptr;
3137     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
3138     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
3139     if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
3140       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
3141     NewMI->getOperand(OffsetPos).setImm(NewOffset);
3142   }
3143   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3144   return NewMI;
3145 }
3146 
3147 /// Update the machine instruction with new virtual registers.  This
3148 /// function may change the defintions and/or uses.
3149 void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
3150                                           unsigned CurStageNum,
3151                                           unsigned InstrStageNum,
3152                                           SMSchedule &Schedule,
3153                                           ValueMapTy *VRMap) {
3154   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
3155     MachineOperand &MO = NewMI->getOperand(i);
3156     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3157       continue;
3158     unsigned reg = MO.getReg();
3159     if (MO.isDef()) {
3160       // Create a new virtual register for the definition.
3161       const TargetRegisterClass *RC = MRI.getRegClass(reg);
3162       unsigned NewReg = MRI.createVirtualRegister(RC);
3163       MO.setReg(NewReg);
3164       VRMap[CurStageNum][reg] = NewReg;
3165       if (LastDef)
3166         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
3167     } else if (MO.isUse()) {
3168       MachineInstr *Def = MRI.getVRegDef(reg);
3169       // Compute the stage that contains the last definition for instruction.
3170       int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
3171       unsigned StageNum = CurStageNum;
3172       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
3173         // Compute the difference in stages between the defintion and the use.
3174         unsigned StageDiff = (InstrStageNum - DefStageNum);
3175         // Make an adjustment to get the last definition.
3176         StageNum -= StageDiff;
3177       }
3178       if (VRMap[StageNum].count(reg))
3179         MO.setReg(VRMap[StageNum][reg]);
3180     }
3181   }
3182 }
3183 
3184 /// Return the instruction in the loop that defines the register.
3185 /// If the definition is a Phi, then follow the Phi operand to
3186 /// the instruction in the loop.
3187 MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
3188   SmallPtrSet<MachineInstr *, 8> Visited;
3189   MachineInstr *Def = MRI.getVRegDef(Reg);
3190   while (Def->isPHI()) {
3191     if (!Visited.insert(Def).second)
3192       break;
3193     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3194       if (Def->getOperand(i + 1).getMBB() == BB) {
3195         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3196         break;
3197       }
3198   }
3199   return Def;
3200 }
3201 
3202 /// Return the new name for the value from the previous stage.
3203 unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3204                                           unsigned LoopVal, unsigned LoopStage,
3205                                           ValueMapTy *VRMap,
3206                                           MachineBasicBlock *BB) {
3207   unsigned PrevVal = 0;
3208   if (StageNum > PhiStage) {
3209     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3210     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
3211       // The name is defined in the previous stage.
3212       PrevVal = VRMap[StageNum - 1][LoopVal];
3213     else if (VRMap[StageNum].count(LoopVal))
3214       // The previous name is defined in the current stage when the instruction
3215       // order is swapped.
3216       PrevVal = VRMap[StageNum][LoopVal];
3217     else if (!LoopInst->isPHI())
3218       // The loop value hasn't yet been scheduled.
3219       PrevVal = LoopVal;
3220     else if (StageNum == PhiStage + 1)
3221       // The loop value is another phi, which has not been scheduled.
3222       PrevVal = getInitPhiReg(*LoopInst, BB);
3223     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
3224       // The loop value is another phi, which has been scheduled.
3225       PrevVal =
3226           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3227                         LoopStage, VRMap, BB);
3228   }
3229   return PrevVal;
3230 }
3231 
3232 /// Rewrite the Phi values in the specified block to use the mappings
3233 /// from the initial operand. Once the Phi is scheduled, we switch
3234 /// to using the loop value instead of the Phi value, so those names
3235 /// do not need to be rewritten.
3236 void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3237                                          unsigned StageNum,
3238                                          SMSchedule &Schedule,
3239                                          ValueMapTy *VRMap,
3240                                          InstrMapTy &InstrMap) {
3241   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
3242                                    BBE = BB->getFirstNonPHI();
3243        BBI != BBE; ++BBI) {
3244     unsigned InitVal = 0;
3245     unsigned LoopVal = 0;
3246     getPhiRegs(*BBI, BB, InitVal, LoopVal);
3247     unsigned PhiDef = BBI->getOperand(0).getReg();
3248 
3249     unsigned PhiStage =
3250         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3251     unsigned LoopStage =
3252         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3253     unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3254     if (NumPhis > StageNum)
3255       NumPhis = StageNum;
3256     for (unsigned np = 0; np <= NumPhis; ++np) {
3257       unsigned NewVal =
3258           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3259       if (!NewVal)
3260         NewVal = InitVal;
3261       rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &*BBI,
3262                             PhiDef, NewVal);
3263     }
3264   }
3265 }
3266 
3267 /// Rewrite a previously scheduled instruction to use the register value
3268 /// from the new instruction. Make sure the instruction occurs in the
3269 /// basic block, and we don't change the uses in the new instruction.
3270 void SwingSchedulerDAG::rewriteScheduledInstr(
3271     MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3272     unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3273     unsigned NewReg, unsigned PrevReg) {
3274   bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3275   int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3276   // Rewrite uses that have been scheduled already to use the new
3277   // Phi register.
3278   for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3279                                          EI = MRI.use_end();
3280        UI != EI;) {
3281     MachineOperand &UseOp = *UI;
3282     MachineInstr *UseMI = UseOp.getParent();
3283     ++UI;
3284     if (UseMI->getParent() != BB)
3285       continue;
3286     if (UseMI->isPHI()) {
3287       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3288         continue;
3289       if (getLoopPhiReg(*UseMI, BB) != OldReg)
3290         continue;
3291     }
3292     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3293     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3294     SUnit *OrigMISU = getSUnit(OrigInstr->second);
3295     int StageSched = Schedule.stageScheduled(OrigMISU);
3296     int CycleSched = Schedule.cycleScheduled(OrigMISU);
3297     unsigned ReplaceReg = 0;
3298     // This is the stage for the scheduled instruction.
3299     if (StagePhi == StageSched && Phi->isPHI()) {
3300       int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3301       if (PrevReg && InProlog)
3302         ReplaceReg = PrevReg;
3303       else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3304                (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3305         ReplaceReg = PrevReg;
3306       else
3307         ReplaceReg = NewReg;
3308     }
3309     // The scheduled instruction occurs before the scheduled Phi, and the
3310     // Phi is not loop carried.
3311     if (!InProlog && StagePhi + 1 == StageSched &&
3312         !Schedule.isLoopCarried(this, *Phi))
3313       ReplaceReg = NewReg;
3314     if (StagePhi > StageSched && Phi->isPHI())
3315       ReplaceReg = NewReg;
3316     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3317       ReplaceReg = NewReg;
3318     if (ReplaceReg) {
3319       MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3320       UseOp.setReg(ReplaceReg);
3321     }
3322   }
3323 }
3324 
3325 /// Check if we can change the instruction to use an offset value from the
3326 /// previous iteration. If so, return true and set the base and offset values
3327 /// so that we can rewrite the load, if necessary.
3328 ///   v1 = Phi(v0, v3)
3329 ///   v2 = load v1, 0
3330 ///   v3 = post_store v1, 4, x
3331 /// This function enables the load to be rewritten as v2 = load v3, 4.
3332 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3333                                               unsigned &BasePos,
3334                                               unsigned &OffsetPos,
3335                                               unsigned &NewBase,
3336                                               int64_t &Offset) {
3337   // Get the load instruction.
3338   if (TII->isPostIncrement(*MI))
3339     return false;
3340   unsigned BasePosLd, OffsetPosLd;
3341   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3342     return false;
3343   unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3344 
3345   // Look for the Phi instruction.
3346   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
3347   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3348   if (!Phi || !Phi->isPHI())
3349     return false;
3350   // Get the register defined in the loop block.
3351   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3352   if (!PrevReg)
3353     return false;
3354 
3355   // Check for the post-increment load/store instruction.
3356   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3357   if (!PrevDef || PrevDef == MI)
3358     return false;
3359 
3360   if (!TII->isPostIncrement(*PrevDef))
3361     return false;
3362 
3363   unsigned BasePos1 = 0, OffsetPos1 = 0;
3364   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3365     return false;
3366 
3367   // Make sure offset values are both positive or both negative.
3368   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3369   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3370   if ((LoadOffset >= 0) != (StoreOffset >= 0))
3371     return false;
3372 
3373   // Set the return value once we determine that we return true.
3374   BasePos = BasePosLd;
3375   OffsetPos = OffsetPosLd;
3376   NewBase = PrevReg;
3377   Offset = StoreOffset;
3378   return true;
3379 }
3380 
3381 /// Apply changes to the instruction if needed. The changes are need
3382 /// to improve the scheduling and depend up on the final schedule.
3383 MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3384                                                   SMSchedule &Schedule,
3385                                                   bool UpdateDAG) {
3386   SUnit *SU = getSUnit(MI);
3387   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3388       InstrChanges.find(SU);
3389   if (It != InstrChanges.end()) {
3390     std::pair<unsigned, int64_t> RegAndOffset = It->second;
3391     unsigned BasePos, OffsetPos;
3392     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3393       return nullptr;
3394     unsigned BaseReg = MI->getOperand(BasePos).getReg();
3395     MachineInstr *LoopDef = findDefInLoop(BaseReg);
3396     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3397     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3398     int BaseStageNum = Schedule.stageScheduled(SU);
3399     int BaseCycleNum = Schedule.cycleScheduled(SU);
3400     if (BaseStageNum < DefStageNum) {
3401       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3402       int OffsetDiff = DefStageNum - BaseStageNum;
3403       if (DefCycleNum < BaseCycleNum) {
3404         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3405         if (OffsetDiff > 0)
3406           --OffsetDiff;
3407       }
3408       int64_t NewOffset =
3409           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3410       NewMI->getOperand(OffsetPos).setImm(NewOffset);
3411       if (UpdateDAG) {
3412         SU->setInstr(NewMI);
3413         MISUnitMap[NewMI] = SU;
3414       }
3415       NewMIs.insert(NewMI);
3416       return NewMI;
3417     }
3418   }
3419   return nullptr;
3420 }
3421 
3422 /// Return true for an order dependence that is loop carried potentially.
3423 /// An order dependence is loop carried if the destination defines a value
3424 /// that may be used by the source in a subsequent iteration.
3425 bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
3426                                            bool isSucc) {
3427   if (!isOrder(Source, Dep) || Dep.isArtificial())
3428     return false;
3429 
3430   if (!SwpPruneLoopCarried)
3431     return true;
3432 
3433   MachineInstr *SI = Source->getInstr();
3434   MachineInstr *DI = Dep.getSUnit()->getInstr();
3435   if (!isSucc)
3436     std::swap(SI, DI);
3437   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3438 
3439   // Assume ordered loads and stores may have a loop carried dependence.
3440   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3441       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3442     return true;
3443 
3444   // Only chain dependences between a load and store can be loop carried.
3445   if (!DI->mayStore() || !SI->mayLoad())
3446     return false;
3447 
3448   unsigned DeltaS, DeltaD;
3449   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
3450     return true;
3451 
3452   unsigned BaseRegS, BaseRegD;
3453   int64_t OffsetS, OffsetD;
3454   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3455   if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
3456       !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
3457     return true;
3458 
3459   if (BaseRegS != BaseRegD)
3460     return true;
3461 
3462   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3463   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3464 
3465   // This is the main test, which checks the offset values and the loop
3466   // increment value to determine if the accesses may be loop carried.
3467   if (OffsetS >= OffsetD)
3468     return OffsetS + AccessSizeS > DeltaS;
3469   else if (OffsetS < OffsetD)
3470     return OffsetD + AccessSizeD > DeltaD;
3471 
3472   return true;
3473 }
3474 
3475 /// Try to schedule the node at the specified StartCycle and continue
3476 /// until the node is schedule or the EndCycle is reached.  This function
3477 /// returns true if the node is scheduled.  This routine may search either
3478 /// forward or backward for a place to insert the instruction based upon
3479 /// the relative values of StartCycle and EndCycle.
3480 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3481   bool forward = true;
3482   if (StartCycle > EndCycle)
3483     forward = false;
3484 
3485   // The terminating condition depends on the direction.
3486   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3487   for (int curCycle = StartCycle; curCycle != termCycle;
3488        forward ? ++curCycle : --curCycle) {
3489 
3490     // Add the already scheduled instructions at the specified cycle to the DFA.
3491     Resources->clearResources();
3492     for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3493          checkCycle <= LastCycle; checkCycle += II) {
3494       std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3495 
3496       for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3497                                          E = cycleInstrs.end();
3498            I != E; ++I) {
3499         if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3500           continue;
3501         assert(Resources->canReserveResources(*(*I)->getInstr()) &&
3502                "These instructions have already been scheduled.");
3503         Resources->reserveResources(*(*I)->getInstr());
3504       }
3505     }
3506     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3507         Resources->canReserveResources(*SU->getInstr())) {
3508       DEBUG({
3509         dbgs() << "\tinsert at cycle " << curCycle << " ";
3510         SU->getInstr()->dump();
3511       });
3512 
3513       ScheduledInstrs[curCycle].push_back(SU);
3514       InstrToCycle.insert(std::make_pair(SU, curCycle));
3515       if (curCycle > LastCycle)
3516         LastCycle = curCycle;
3517       if (curCycle < FirstCycle)
3518         FirstCycle = curCycle;
3519       return true;
3520     }
3521     DEBUG({
3522       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3523       SU->getInstr()->dump();
3524     });
3525   }
3526   return false;
3527 }
3528 
3529 // Return the cycle of the earliest scheduled instruction in the chain.
3530 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3531   SmallPtrSet<SUnit *, 8> Visited;
3532   SmallVector<SDep, 8> Worklist;
3533   Worklist.push_back(Dep);
3534   int EarlyCycle = INT_MAX;
3535   while (!Worklist.empty()) {
3536     const SDep &Cur = Worklist.pop_back_val();
3537     SUnit *PrevSU = Cur.getSUnit();
3538     if (Visited.count(PrevSU))
3539       continue;
3540     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3541     if (it == InstrToCycle.end())
3542       continue;
3543     EarlyCycle = std::min(EarlyCycle, it->second);
3544     for (const auto &PI : PrevSU->Preds)
3545       if (SwingSchedulerDAG::isOrder(PrevSU, PI))
3546         Worklist.push_back(PI);
3547     Visited.insert(PrevSU);
3548   }
3549   return EarlyCycle;
3550 }
3551 
3552 // Return the cycle of the latest scheduled instruction in the chain.
3553 int SMSchedule::latestCycleInChain(const SDep &Dep) {
3554   SmallPtrSet<SUnit *, 8> Visited;
3555   SmallVector<SDep, 8> Worklist;
3556   Worklist.push_back(Dep);
3557   int LateCycle = INT_MIN;
3558   while (!Worklist.empty()) {
3559     const SDep &Cur = Worklist.pop_back_val();
3560     SUnit *SuccSU = Cur.getSUnit();
3561     if (Visited.count(SuccSU))
3562       continue;
3563     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3564     if (it == InstrToCycle.end())
3565       continue;
3566     LateCycle = std::max(LateCycle, it->second);
3567     for (const auto &SI : SuccSU->Succs)
3568       if (SwingSchedulerDAG::isOrder(SuccSU, SI))
3569         Worklist.push_back(SI);
3570     Visited.insert(SuccSU);
3571   }
3572   return LateCycle;
3573 }
3574 
3575 /// If an instruction has a use that spans multiple iterations, then
3576 /// return true. These instructions are characterized by having a back-ege
3577 /// to a Phi, which contains a reference to another Phi.
3578 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3579   for (auto &P : SU->Preds)
3580     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
3581       for (auto &S : P.getSUnit()->Succs)
3582         if (S.getKind() == SDep::Order && S.getSUnit()->getInstr()->isPHI())
3583           return P.getSUnit();
3584   return nullptr;
3585 }
3586 
3587 /// Compute the scheduling start slot for the instruction.  The start slot
3588 /// depends on any predecessor or successor nodes scheduled already.
3589 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3590                               int *MinEnd, int *MaxStart, int II,
3591                               SwingSchedulerDAG *DAG) {
3592   // Iterate over each instruction that has been scheduled already.  The start
3593   // slot computuation depends on whether the previously scheduled instruction
3594   // is a predecessor or successor of the specified instruction.
3595   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3596 
3597     // Iterate over each instruction in the current cycle.
3598     for (SUnit *I : getInstructions(cycle)) {
3599       // Because we're processing a DAG for the dependences, we recognize
3600       // the back-edge in recurrences by anti dependences.
3601       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
3602         const SDep &Dep = SU->Preds[i];
3603         if (Dep.getSUnit() == I) {
3604           if (!DAG->isBackedge(SU, Dep)) {
3605             int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
3606                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3607             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3608             if (DAG->isLoopCarriedOrder(SU, Dep, false)) {
3609               int End = earliestCycleInChain(Dep) + (II - 1);
3610               *MinEnd = std::min(*MinEnd, End);
3611             }
3612           } else {
3613             int LateStart = cycle - DAG->getLatency(SU, Dep) +
3614                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3615             *MinLateStart = std::min(*MinLateStart, LateStart);
3616           }
3617         }
3618         // For instruction that requires multiple iterations, make sure that
3619         // the dependent instruction is not scheduled past the definition.
3620         SUnit *BE = multipleIterations(I, DAG);
3621         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3622             !SU->isPred(I))
3623           *MinLateStart = std::min(*MinLateStart, cycle);
3624       }
3625       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i)
3626         if (SU->Succs[i].getSUnit() == I) {
3627           const SDep &Dep = SU->Succs[i];
3628           if (!DAG->isBackedge(SU, Dep)) {
3629             int LateStart = cycle - DAG->getLatency(SU, Dep) +
3630                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3631             *MinLateStart = std::min(*MinLateStart, LateStart);
3632             if (DAG->isLoopCarriedOrder(SU, Dep)) {
3633               int Start = latestCycleInChain(Dep) + 1 - II;
3634               *MaxStart = std::max(*MaxStart, Start);
3635             }
3636           } else {
3637             int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
3638                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3639             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3640           }
3641         }
3642     }
3643   }
3644 }
3645 
3646 /// Order the instructions within a cycle so that the definitions occur
3647 /// before the uses. Returns true if the instruction is added to the start
3648 /// of the list, or false if added to the end.
3649 bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3650                                  std::deque<SUnit *> &Insts) {
3651   MachineInstr *MI = SU->getInstr();
3652   bool OrderBeforeUse = false;
3653   bool OrderAfterDef = false;
3654   bool OrderBeforeDef = false;
3655   unsigned MoveDef = 0;
3656   unsigned MoveUse = 0;
3657   int StageInst1 = stageScheduled(SU);
3658 
3659   unsigned Pos = 0;
3660   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3661        ++I, ++Pos) {
3662     // Relative order of Phis does not matter.
3663     if (MI->isPHI() && (*I)->getInstr()->isPHI())
3664       continue;
3665     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3666       MachineOperand &MO = MI->getOperand(i);
3667       if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3668         continue;
3669       unsigned Reg = MO.getReg();
3670       unsigned BasePos, OffsetPos;
3671       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3672         if (MI->getOperand(BasePos).getReg() == Reg)
3673           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3674             Reg = NewReg;
3675       bool Reads, Writes;
3676       std::tie(Reads, Writes) =
3677           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3678       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3679         OrderBeforeUse = true;
3680         MoveUse = Pos;
3681       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3682         // Add the instruction after the scheduled instruction.
3683         OrderAfterDef = true;
3684         MoveDef = Pos;
3685       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3686         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3687           OrderBeforeUse = true;
3688           MoveUse = Pos;
3689         } else {
3690           OrderAfterDef = true;
3691           MoveDef = Pos;
3692         }
3693       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3694         OrderBeforeUse = true;
3695         MoveUse = Pos;
3696         if (MoveUse != 0) {
3697           OrderAfterDef = true;
3698           MoveDef = Pos - 1;
3699         }
3700       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3701         // Add the instruction before the scheduled instruction.
3702         OrderBeforeUse = true;
3703         MoveUse = Pos;
3704       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3705                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3706         OrderBeforeDef = true;
3707         MoveUse = Pos;
3708       }
3709     }
3710     // Check for order dependences between instructions. Make sure the source
3711     // is ordered before the destination.
3712     for (auto &S : SU->Succs)
3713       if (S.getKind() == SDep::Order) {
3714         if (S.getSUnit() == *I && stageScheduled(*I) == StageInst1) {
3715           OrderBeforeUse = true;
3716           MoveUse = Pos;
3717         }
3718       } else if (TargetRegisterInfo::isPhysicalRegister(S.getReg())) {
3719         if (cycleScheduled(SU) != cycleScheduled(S.getSUnit())) {
3720           if (S.isAssignedRegDep()) {
3721             OrderAfterDef = true;
3722             MoveDef = Pos;
3723           }
3724         } else {
3725           OrderBeforeUse = true;
3726           MoveUse = Pos;
3727         }
3728       }
3729     for (auto &P : SU->Preds)
3730       if (P.getKind() == SDep::Order) {
3731         if (P.getSUnit() == *I && stageScheduled(*I) == StageInst1) {
3732           OrderAfterDef = true;
3733           MoveDef = Pos;
3734         }
3735       } else if (TargetRegisterInfo::isPhysicalRegister(P.getReg())) {
3736         if (cycleScheduled(SU) != cycleScheduled(P.getSUnit())) {
3737           if (P.isAssignedRegDep()) {
3738             OrderBeforeUse = true;
3739             MoveUse = Pos;
3740           }
3741         } else {
3742           OrderAfterDef = true;
3743           MoveDef = Pos;
3744         }
3745       }
3746   }
3747 
3748   // A circular dependence.
3749   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3750     OrderBeforeUse = false;
3751 
3752   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3753   // to a loop-carried dependence.
3754   if (OrderBeforeDef)
3755     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3756 
3757   // The uncommon case when the instruction order needs to be updated because
3758   // there is both a use and def.
3759   if (OrderBeforeUse && OrderAfterDef) {
3760     SUnit *UseSU = Insts.at(MoveUse);
3761     SUnit *DefSU = Insts.at(MoveDef);
3762     if (MoveUse > MoveDef) {
3763       Insts.erase(Insts.begin() + MoveUse);
3764       Insts.erase(Insts.begin() + MoveDef);
3765     } else {
3766       Insts.erase(Insts.begin() + MoveDef);
3767       Insts.erase(Insts.begin() + MoveUse);
3768     }
3769     if (orderDependence(SSD, UseSU, Insts)) {
3770       Insts.push_front(SU);
3771       orderDependence(SSD, DefSU, Insts);
3772       return true;
3773     }
3774     Insts.pop_back();
3775     Insts.push_back(SU);
3776     Insts.push_back(UseSU);
3777     orderDependence(SSD, DefSU, Insts);
3778     return false;
3779   }
3780   // Put the new instruction first if there is a use in the list. Otherwise,
3781   // put it at the end of the list.
3782   if (OrderBeforeUse)
3783     Insts.push_front(SU);
3784   else
3785     Insts.push_back(SU);
3786   return OrderBeforeUse;
3787 }
3788 
3789 /// Return true if the scheduled Phi has a loop carried operand.
3790 bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3791   if (!Phi.isPHI())
3792     return false;
3793   assert(Phi.isPHI() && "Expecing a Phi.");
3794   SUnit *DefSU = SSD->getSUnit(&Phi);
3795   unsigned DefCycle = cycleScheduled(DefSU);
3796   int DefStage = stageScheduled(DefSU);
3797 
3798   unsigned InitVal = 0;
3799   unsigned LoopVal = 0;
3800   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3801   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3802   if (!UseSU)
3803     return true;
3804   if (UseSU->getInstr()->isPHI())
3805     return true;
3806   unsigned LoopCycle = cycleScheduled(UseSU);
3807   int LoopStage = stageScheduled(UseSU);
3808   return LoopCycle > DefCycle ||
3809          (LoopCycle <= DefCycle && LoopStage <= DefStage);
3810 }
3811 
3812 /// Return true if the instruction is a definition that is loop carried
3813 /// and defines the use on the next iteration.
3814 ///        v1 = phi(v2, v3)
3815 ///  (Def) v3 = op v1
3816 ///  (MO)   = v1
3817 /// If MO appears before Def, then then v1 and v3 may get assigned to the same
3818 /// register.
3819 bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
3820                                        MachineInstr *Def, MachineOperand &MO) {
3821   if (!MO.isReg())
3822     return false;
3823   if (Def->isPHI())
3824     return false;
3825   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3826   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3827     return false;
3828   if (!isLoopCarried(SSD, *Phi))
3829     return false;
3830   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3831   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
3832     MachineOperand &DMO = Def->getOperand(i);
3833     if (!DMO.isReg() || !DMO.isDef())
3834       continue;
3835     if (DMO.getReg() == LoopReg)
3836       return true;
3837   }
3838   return false;
3839 }
3840 
3841 // Check if the generated schedule is valid. This function checks if
3842 // an instruction that uses a physical register is scheduled in a
3843 // different stage than the definition. The pipeliner does not handle
3844 // physical register values that may cross a basic block boundary.
3845 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3846   for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
3847     SUnit &SU = SSD->SUnits[i];
3848     if (!SU.hasPhysRegDefs)
3849       continue;
3850     int StageDef = stageScheduled(&SU);
3851     assert(StageDef != -1 && "Instruction should have been scheduled.");
3852     for (auto &SI : SU.Succs)
3853       if (SI.isAssignedRegDep())
3854         if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
3855           if (stageScheduled(SI.getSUnit()) != StageDef)
3856             return false;
3857   }
3858   return true;
3859 }
3860 
3861 /// After the schedule has been formed, call this function to combine
3862 /// the instructions from the different stages/cycles.  That is, this
3863 /// function creates a schedule that represents a single iteration.
3864 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3865   // Move all instructions to the first stage from later stages.
3866   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3867     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3868          ++stage) {
3869       std::deque<SUnit *> &cycleInstrs =
3870           ScheduledInstrs[cycle + (stage * InitiationInterval)];
3871       for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
3872                                                  E = cycleInstrs.rend();
3873            I != E; ++I)
3874         ScheduledInstrs[cycle].push_front(*I);
3875     }
3876   }
3877   // Iterate over the definitions in each instruction, and compute the
3878   // stage difference for each use.  Keep the maximum value.
3879   for (auto &I : InstrToCycle) {
3880     int DefStage = stageScheduled(I.first);
3881     MachineInstr *MI = I.first->getInstr();
3882     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3883       MachineOperand &Op = MI->getOperand(i);
3884       if (!Op.isReg() || !Op.isDef())
3885         continue;
3886 
3887       unsigned Reg = Op.getReg();
3888       unsigned MaxDiff = 0;
3889       bool PhiIsSwapped = false;
3890       for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
3891                                              EI = MRI.use_end();
3892            UI != EI; ++UI) {
3893         MachineOperand &UseOp = *UI;
3894         MachineInstr *UseMI = UseOp.getParent();
3895         SUnit *SUnitUse = SSD->getSUnit(UseMI);
3896         int UseStage = stageScheduled(SUnitUse);
3897         unsigned Diff = 0;
3898         if (UseStage != -1 && UseStage >= DefStage)
3899           Diff = UseStage - DefStage;
3900         if (MI->isPHI()) {
3901           if (isLoopCarried(SSD, *MI))
3902             ++Diff;
3903           else
3904             PhiIsSwapped = true;
3905         }
3906         MaxDiff = std::max(Diff, MaxDiff);
3907       }
3908       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
3909     }
3910   }
3911 
3912   // Erase all the elements in the later stages. Only one iteration should
3913   // remain in the scheduled list, and it contains all the instructions.
3914   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3915     ScheduledInstrs.erase(cycle);
3916 
3917   // Change the registers in instruction as specified in the InstrChanges
3918   // map. We need to use the new registers to create the correct order.
3919   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
3920     SUnit *SU = &SSD->SUnits[i];
3921     SSD->applyInstrChange(SU->getInstr(), *this, true);
3922   }
3923 
3924   // Reorder the instructions in each cycle to fix and improve the
3925   // generated code.
3926   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3927     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3928     std::deque<SUnit *> newOrderZC;
3929     // Put the zero-cost, pseudo instructions at the start of the cycle.
3930     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3931       SUnit *SU = cycleInstrs[i];
3932       if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3933         orderDependence(SSD, SU, newOrderZC);
3934     }
3935     std::deque<SUnit *> newOrderI;
3936     // Then, add the regular instructions back.
3937     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3938       SUnit *SU = cycleInstrs[i];
3939       if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3940         orderDependence(SSD, SU, newOrderI);
3941     }
3942     // Replace the old order with the new order.
3943     cycleInstrs.swap(newOrderZC);
3944     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
3945   }
3946 
3947   DEBUG(dump(););
3948 }
3949 
3950 /// Print the schedule information to the given output.
3951 void SMSchedule::print(raw_ostream &os) const {
3952   // Iterate over each cycle.
3953   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3954     // Iterate over each instruction in the cycle.
3955     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3956     for (SUnit *CI : cycleInstrs->second) {
3957       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3958       os << "(" << CI->NodeNum << ") ";
3959       CI->getInstr()->print(os);
3960       os << "\n";
3961     }
3962   }
3963 }
3964 
3965 /// Utility function used for debugging to print the schedule.
3966 void SMSchedule::dump() const { print(dbgs()); }
3967