1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
11 //
12 // Software pipelining (SWP) is an instruction scheduling technique for loops
13 // that overlap loop iterations and explioits ILP via a compiler transformation.
14 //
15 // Swing Modulo Scheduling is an implementation of software pipelining
16 // that generates schedules that are near optimal in terms of initiation
17 // interval, register requirements, and stage count. See the papers:
18 //
19 // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
20 // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Processings of the 1996
21 // Conference on Parallel Architectures and Compilation Techiniques.
22 //
23 // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
24 // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
25 // Transactions on Computers, Vol. 50, No. 3, 2001.
26 //
27 // "An Implementation of Swing Modulo Scheduling With Extensions for
28 // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
29 // Urbana-Chambpain, 2005.
30 //
31 //
32 // The SMS algorithm consists of three main steps after computing the minimal
33 // initiation interval (MII).
34 // 1) Analyze the dependence graph and compute information about each
35 //    instruction in the graph.
36 // 2) Order the nodes (instructions) by priority based upon the heuristics
37 //    described in the algorithm.
38 // 3) Attempt to schedule the nodes in the specified order using the MII.
39 //
40 // This SMS implementation is a target-independent back-end pass. When enabled,
41 // the pass runs just prior to the register allocation pass, while the machine
42 // IR is in SSA form. If software pipelining is successful, then the original
43 // loop is replaced by the optimized loop. The optimized loop contains one or
44 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
45 // the instructions cannot be scheduled in a given MII, we increase the MII by
46 // one and try again.
47 //
48 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
49 // represent loop carried dependences in the DAG as order edges to the Phi
50 // nodes. We also perform several passes over the DAG to eliminate unnecessary
51 // edges that inhibit the ability to pipeline. The implementation uses the
52 // DFAPacketizer class to compute the minimum initiation interval and the check
53 // where an instruction may be inserted in the pipelined schedule.
54 //
55 // In order for the SMS pass to work, several target specific hooks need to be
56 // implemented to get information about the loop structure and to rewrite
57 // instructions.
58 //
59 //===----------------------------------------------------------------------===//
60 
61 #include "llvm/ADT/ArrayRef.h"
62 #include "llvm/ADT/BitVector.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/MapVector.h"
65 #include "llvm/ADT/PriorityQueue.h"
66 #include "llvm/ADT/SetVector.h"
67 #include "llvm/ADT/SmallPtrSet.h"
68 #include "llvm/ADT/SmallSet.h"
69 #include "llvm/ADT/SmallVector.h"
70 #include "llvm/ADT/Statistic.h"
71 #include "llvm/ADT/iterator_range.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/Analysis/MemoryLocation.h"
74 #include "llvm/Analysis/ValueTracking.h"
75 #include "llvm/CodeGen/DFAPacketizer.h"
76 #include "llvm/CodeGen/LiveIntervals.h"
77 #include "llvm/CodeGen/MachineBasicBlock.h"
78 #include "llvm/CodeGen/MachineDominators.h"
79 #include "llvm/CodeGen/MachineFunction.h"
80 #include "llvm/CodeGen/MachineFunctionPass.h"
81 #include "llvm/CodeGen/MachineInstr.h"
82 #include "llvm/CodeGen/MachineInstrBuilder.h"
83 #include "llvm/CodeGen/MachineLoopInfo.h"
84 #include "llvm/CodeGen/MachineMemOperand.h"
85 #include "llvm/CodeGen/MachineOperand.h"
86 #include "llvm/CodeGen/MachineRegisterInfo.h"
87 #include "llvm/CodeGen/RegisterClassInfo.h"
88 #include "llvm/CodeGen/RegisterPressure.h"
89 #include "llvm/CodeGen/ScheduleDAG.h"
90 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
91 #include "llvm/CodeGen/ScheduleDAGMutation.h"
92 #include "llvm/CodeGen/TargetInstrInfo.h"
93 #include "llvm/CodeGen/TargetOpcodes.h"
94 #include "llvm/CodeGen/TargetRegisterInfo.h"
95 #include "llvm/CodeGen/TargetSubtargetInfo.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/DebugLoc.h"
98 #include "llvm/IR/Function.h"
99 #include "llvm/MC/LaneBitmask.h"
100 #include "llvm/MC/MCInstrDesc.h"
101 #include "llvm/MC/MCInstrItineraries.h"
102 #include "llvm/MC/MCRegisterInfo.h"
103 #include "llvm/Pass.h"
104 #include "llvm/Support/CommandLine.h"
105 #include "llvm/Support/Compiler.h"
106 #include "llvm/Support/Debug.h"
107 #include "llvm/Support/MathExtras.h"
108 #include "llvm/Support/raw_ostream.h"
109 #include <algorithm>
110 #include <cassert>
111 #include <climits>
112 #include <cstdint>
113 #include <deque>
114 #include <functional>
115 #include <iterator>
116 #include <map>
117 #include <memory>
118 #include <tuple>
119 #include <utility>
120 #include <vector>
121 
122 using namespace llvm;
123 
124 #define DEBUG_TYPE "pipeliner"
125 
126 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
127 STATISTIC(NumPipelined, "Number of loops software pipelined");
128 
129 /// A command line option to turn software pipelining on or off.
130 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
131                                cl::ZeroOrMore,
132                                cl::desc("Enable Software Pipelining"));
133 
134 /// A command line option to enable SWP at -Os.
135 static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
136                                       cl::desc("Enable SWP at Os."), cl::Hidden,
137                                       cl::init(false));
138 
139 /// A command line argument to limit minimum initial interval for pipelining.
140 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
141                               cl::desc("Size limit for the MII."),
142                               cl::Hidden, cl::init(27));
143 
144 /// A command line argument to limit the number of stages in the pipeline.
145 static cl::opt<int>
146     SwpMaxStages("pipeliner-max-stages",
147                  cl::desc("Maximum stages allowed in the generated scheduled."),
148                  cl::Hidden, cl::init(3));
149 
150 /// A command line option to disable the pruning of chain dependences due to
151 /// an unrelated Phi.
152 static cl::opt<bool>
153     SwpPruneDeps("pipeliner-prune-deps",
154                  cl::desc("Prune dependences between unrelated Phi nodes."),
155                  cl::Hidden, cl::init(true));
156 
157 /// A command line option to disable the pruning of loop carried order
158 /// dependences.
159 static cl::opt<bool>
160     SwpPruneLoopCarried("pipeliner-prune-loop-carried",
161                         cl::desc("Prune loop carried order dependences."),
162                         cl::Hidden, cl::init(true));
163 
164 #ifndef NDEBUG
165 static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
166 #endif
167 
168 static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
169                                      cl::ReallyHidden, cl::init(false),
170                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
171 
172 namespace {
173 
174 class NodeSet;
175 class SMSchedule;
176 
177 /// The main class in the implementation of the target independent
178 /// software pipeliner pass.
179 class MachinePipeliner : public MachineFunctionPass {
180 public:
181   MachineFunction *MF = nullptr;
182   const MachineLoopInfo *MLI = nullptr;
183   const MachineDominatorTree *MDT = nullptr;
184   const InstrItineraryData *InstrItins;
185   const TargetInstrInfo *TII = nullptr;
186   RegisterClassInfo RegClassInfo;
187 
188 #ifndef NDEBUG
189   static int NumTries;
190 #endif
191 
192   /// Cache the target analysis information about the loop.
193   struct LoopInfo {
194     MachineBasicBlock *TBB = nullptr;
195     MachineBasicBlock *FBB = nullptr;
196     SmallVector<MachineOperand, 4> BrCond;
197     MachineInstr *LoopInductionVar = nullptr;
198     MachineInstr *LoopCompare = nullptr;
199   };
200   LoopInfo LI;
201 
202   static char ID;
203 
204   MachinePipeliner() : MachineFunctionPass(ID) {
205     initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
206   }
207 
208   bool runOnMachineFunction(MachineFunction &MF) override;
209 
210   void getAnalysisUsage(AnalysisUsage &AU) const override {
211     AU.addRequired<AAResultsWrapperPass>();
212     AU.addPreserved<AAResultsWrapperPass>();
213     AU.addRequired<MachineLoopInfo>();
214     AU.addRequired<MachineDominatorTree>();
215     AU.addRequired<LiveIntervals>();
216     MachineFunctionPass::getAnalysisUsage(AU);
217   }
218 
219 private:
220   bool canPipelineLoop(MachineLoop &L);
221   bool scheduleLoop(MachineLoop &L);
222   bool swingModuloScheduler(MachineLoop &L);
223 };
224 
225 /// This class builds the dependence graph for the instructions in a loop,
226 /// and attempts to schedule the instructions using the SMS algorithm.
227 class SwingSchedulerDAG : public ScheduleDAGInstrs {
228   MachinePipeliner &Pass;
229   /// The minimum initiation interval between iterations for this schedule.
230   unsigned MII = 0;
231   /// Set to true if a valid pipelined schedule is found for the loop.
232   bool Scheduled = false;
233   MachineLoop &Loop;
234   LiveIntervals &LIS;
235   const RegisterClassInfo &RegClassInfo;
236 
237   /// A toplogical ordering of the SUnits, which is needed for changing
238   /// dependences and iterating over the SUnits.
239   ScheduleDAGTopologicalSort Topo;
240 
241   struct NodeInfo {
242     int ASAP = 0;
243     int ALAP = 0;
244 
245     NodeInfo() = default;
246   };
247   /// Computed properties for each node in the graph.
248   std::vector<NodeInfo> ScheduleInfo;
249 
250   enum OrderKind { BottomUp = 0, TopDown = 1 };
251   /// Computed node ordering for scheduling.
252   SetVector<SUnit *> NodeOrder;
253 
254   using NodeSetType = SmallVector<NodeSet, 8>;
255   using ValueMapTy = DenseMap<unsigned, unsigned>;
256   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
257   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
258 
259   /// Instructions to change when emitting the final schedule.
260   DenseMap<SUnit *, std::pair<unsigned, int64_t>> InstrChanges;
261 
262   /// We may create a new instruction, so remember it because it
263   /// must be deleted when the pass is finished.
264   SmallPtrSet<MachineInstr *, 4> NewMIs;
265 
266   /// Ordered list of DAG postprocessing steps.
267   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
268 
269   /// Helper class to implement Johnson's circuit finding algorithm.
270   class Circuits {
271     std::vector<SUnit> &SUnits;
272     SetVector<SUnit *> Stack;
273     BitVector Blocked;
274     SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
275     SmallVector<SmallVector<int, 4>, 16> AdjK;
276     unsigned NumPaths;
277     static unsigned MaxPaths;
278 
279   public:
280     Circuits(std::vector<SUnit> &SUs)
281         : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
282 
283     /// Reset the data structures used in the circuit algorithm.
284     void reset() {
285       Stack.clear();
286       Blocked.reset();
287       B.assign(SUnits.size(), SmallPtrSet<SUnit *, 4>());
288       NumPaths = 0;
289     }
290 
291     void createAdjacencyStructure(SwingSchedulerDAG *DAG);
292     bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
293     void unblock(int U);
294   };
295 
296 public:
297   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
298                     const RegisterClassInfo &rci)
299       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
300         RegClassInfo(rci), Topo(SUnits, &ExitSU) {
301     P.MF->getSubtarget().getSMSMutations(Mutations);
302   }
303 
304   void schedule() override;
305   void finishBlock() override;
306 
307   /// Return true if the loop kernel has been scheduled.
308   bool hasNewSchedule() { return Scheduled; }
309 
310   /// Return the earliest time an instruction may be scheduled.
311   int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
312 
313   /// Return the latest time an instruction my be scheduled.
314   int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
315 
316   /// The mobility function, which the number of slots in which
317   /// an instruction may be scheduled.
318   int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
319 
320   /// The depth, in the dependence graph, for a node.
321   int getDepth(SUnit *Node) { return Node->getDepth(); }
322 
323   /// The height, in the dependence graph, for a node.
324   int getHeight(SUnit *Node) { return Node->getHeight(); }
325 
326   /// Return true if the dependence is a back-edge in the data dependence graph.
327   /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
328   /// using an anti dependence from a Phi to an instruction.
329   bool isBackedge(SUnit *Source, const SDep &Dep) {
330     if (Dep.getKind() != SDep::Anti)
331       return false;
332     return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
333   }
334 
335   /// Return true if the dependence is an order dependence between non-Phis.
336   static bool isOrder(SUnit *Source, const SDep &Dep) {
337     if (Dep.getKind() != SDep::Order)
338       return false;
339     return (!Source->getInstr()->isPHI() &&
340             !Dep.getSUnit()->getInstr()->isPHI());
341   }
342 
343   bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
344 
345   /// The latency of the dependence.
346   unsigned getLatency(SUnit *Source, const SDep &Dep) {
347     // Anti dependences represent recurrences, so use the latency of the
348     // instruction on the back-edge.
349     if (Dep.getKind() == SDep::Anti) {
350       if (Source->getInstr()->isPHI())
351         return Dep.getSUnit()->Latency;
352       if (Dep.getSUnit()->getInstr()->isPHI())
353         return Source->Latency;
354       return Dep.getLatency();
355     }
356     return Dep.getLatency();
357   }
358 
359   /// The distance function, which indicates that operation V of iteration I
360   /// depends on operations U of iteration I-distance.
361   unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
362     // Instructions that feed a Phi have a distance of 1. Computing larger
363     // values for arrays requires data dependence information.
364     if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
365       return 1;
366     return 0;
367   }
368 
369   /// Set the Minimum Initiation Interval for this schedule attempt.
370   void setMII(unsigned mii) { MII = mii; }
371 
372   void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
373 
374   void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
375 
376   /// Return the new base register that was stored away for the changed
377   /// instruction.
378   unsigned getInstrBaseReg(SUnit *SU) {
379     DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
380         InstrChanges.find(SU);
381     if (It != InstrChanges.end())
382       return It->second.first;
383     return 0;
384   }
385 
386   void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
387     Mutations.push_back(std::move(Mutation));
388   }
389 
390 private:
391   void addLoopCarriedDependences(AliasAnalysis *AA);
392   void updatePhiDependences();
393   void changeDependences();
394   unsigned calculateResMII();
395   unsigned calculateRecMII(NodeSetType &RecNodeSets);
396   void findCircuits(NodeSetType &NodeSets);
397   void fuseRecs(NodeSetType &NodeSets);
398   void removeDuplicateNodes(NodeSetType &NodeSets);
399   void computeNodeFunctions(NodeSetType &NodeSets);
400   void registerPressureFilter(NodeSetType &NodeSets);
401   void colocateNodeSets(NodeSetType &NodeSets);
402   void checkNodeSets(NodeSetType &NodeSets);
403   void groupRemainingNodes(NodeSetType &NodeSets);
404   void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
405                          SetVector<SUnit *> &NodesAdded);
406   void computeNodeOrder(NodeSetType &NodeSets);
407   bool schedulePipeline(SMSchedule &Schedule);
408   void generatePipelinedLoop(SMSchedule &Schedule);
409   void generateProlog(SMSchedule &Schedule, unsigned LastStage,
410                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
411                       MBBVectorTy &PrologBBs);
412   void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
413                       MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
414                       MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
415   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
416                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
417                             SMSchedule &Schedule, ValueMapTy *VRMap,
418                             InstrMapTy &InstrMap, unsigned LastStageNum,
419                             unsigned CurStageNum, bool IsLast);
420   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
421                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
422                     SMSchedule &Schedule, ValueMapTy *VRMap,
423                     InstrMapTy &InstrMap, unsigned LastStageNum,
424                     unsigned CurStageNum, bool IsLast);
425   void removeDeadInstructions(MachineBasicBlock *KernelBB,
426                               MBBVectorTy &EpilogBBs);
427   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
428                       SMSchedule &Schedule);
429   void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
430                    MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
431                    ValueMapTy *VRMap);
432   bool computeDelta(MachineInstr &MI, unsigned &Delta);
433   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
434                          unsigned Num);
435   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
436                            unsigned InstStageNum);
437   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
438                                     unsigned InstStageNum,
439                                     SMSchedule &Schedule);
440   void updateInstruction(MachineInstr *NewMI, bool LastDef,
441                          unsigned CurStageNum, unsigned InstStageNum,
442                          SMSchedule &Schedule, ValueMapTy *VRMap);
443   MachineInstr *findDefInLoop(unsigned Reg);
444   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
445                          unsigned LoopStage, ValueMapTy *VRMap,
446                          MachineBasicBlock *BB);
447   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
448                         SMSchedule &Schedule, ValueMapTy *VRMap,
449                         InstrMapTy &InstrMap);
450   void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
451                              InstrMapTy &InstrMap, unsigned CurStageNum,
452                              unsigned PhiNum, MachineInstr *Phi,
453                              unsigned OldReg, unsigned NewReg,
454                              unsigned PrevReg = 0);
455   bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
456                              unsigned &OffsetPos, unsigned &NewBase,
457                              int64_t &NewOffset);
458   void postprocessDAG();
459 };
460 
461 /// A NodeSet contains a set of SUnit DAG nodes with additional information
462 /// that assigns a priority to the set.
463 class NodeSet {
464   SetVector<SUnit *> Nodes;
465   bool HasRecurrence = false;
466   unsigned RecMII = 0;
467   int MaxMOV = 0;
468   int MaxDepth = 0;
469   unsigned Colocate = 0;
470   SUnit *ExceedPressure = nullptr;
471 
472 public:
473   using iterator = SetVector<SUnit *>::const_iterator;
474 
475   NodeSet() = default;
476   NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
477 
478   bool insert(SUnit *SU) { return Nodes.insert(SU); }
479 
480   void insert(iterator S, iterator E) { Nodes.insert(S, E); }
481 
482   template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
483     return Nodes.remove_if(P);
484   }
485 
486   unsigned count(SUnit *SU) const { return Nodes.count(SU); }
487 
488   bool hasRecurrence() { return HasRecurrence; };
489 
490   unsigned size() const { return Nodes.size(); }
491 
492   bool empty() const { return Nodes.empty(); }
493 
494   SUnit *getNode(unsigned i) const { return Nodes[i]; };
495 
496   void setRecMII(unsigned mii) { RecMII = mii; };
497 
498   void setColocate(unsigned c) { Colocate = c; };
499 
500   void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
501 
502   bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
503 
504   int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
505 
506   int getRecMII() { return RecMII; }
507 
508   /// Summarize node functions for the entire node set.
509   void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
510     for (SUnit *SU : *this) {
511       MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
512       MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
513     }
514   }
515 
516   void clear() {
517     Nodes.clear();
518     RecMII = 0;
519     HasRecurrence = false;
520     MaxMOV = 0;
521     MaxDepth = 0;
522     Colocate = 0;
523     ExceedPressure = nullptr;
524   }
525 
526   operator SetVector<SUnit *> &() { return Nodes; }
527 
528   /// Sort the node sets by importance. First, rank them by recurrence MII,
529   /// then by mobility (least mobile done first), and finally by depth.
530   /// Each node set may contain a colocate value which is used as the first
531   /// tie breaker, if it's set.
532   bool operator>(const NodeSet &RHS) const {
533     if (RecMII == RHS.RecMII) {
534       if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
535         return Colocate < RHS.Colocate;
536       if (MaxMOV == RHS.MaxMOV)
537         return MaxDepth > RHS.MaxDepth;
538       return MaxMOV < RHS.MaxMOV;
539     }
540     return RecMII > RHS.RecMII;
541   }
542 
543   bool operator==(const NodeSet &RHS) const {
544     return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
545            MaxDepth == RHS.MaxDepth;
546   }
547 
548   bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
549 
550   iterator begin() { return Nodes.begin(); }
551   iterator end() { return Nodes.end(); }
552 
553   void print(raw_ostream &os) const {
554     os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
555        << " depth " << MaxDepth << " col " << Colocate << "\n";
556     for (const auto &I : Nodes)
557       os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
558     os << "\n";
559   }
560 
561 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
562   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
563 #endif
564 };
565 
566 /// This class repesents the scheduled code.  The main data structure is a
567 /// map from scheduled cycle to instructions.  During scheduling, the
568 /// data structure explicitly represents all stages/iterations.   When
569 /// the algorithm finshes, the schedule is collapsed into a single stage,
570 /// which represents instructions from different loop iterations.
571 ///
572 /// The SMS algorithm allows negative values for cycles, so the first cycle
573 /// in the schedule is the smallest cycle value.
574 class SMSchedule {
575 private:
576   /// Map from execution cycle to instructions.
577   DenseMap<int, std::deque<SUnit *>> ScheduledInstrs;
578 
579   /// Map from instruction to execution cycle.
580   std::map<SUnit *, int> InstrToCycle;
581 
582   /// Map for each register and the max difference between its uses and def.
583   /// The first element in the pair is the max difference in stages. The
584   /// second is true if the register defines a Phi value and loop value is
585   /// scheduled before the Phi.
586   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
587 
588   /// Keep track of the first cycle value in the schedule.  It starts
589   /// as zero, but the algorithm allows negative values.
590   int FirstCycle = 0;
591 
592   /// Keep track of the last cycle value in the schedule.
593   int LastCycle = 0;
594 
595   /// The initiation interval (II) for the schedule.
596   int InitiationInterval = 0;
597 
598   /// Target machine information.
599   const TargetSubtargetInfo &ST;
600 
601   /// Virtual register information.
602   MachineRegisterInfo &MRI;
603 
604   std::unique_ptr<DFAPacketizer> Resources;
605 
606 public:
607   SMSchedule(MachineFunction *mf)
608       : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
609         Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
610 
611   void reset() {
612     ScheduledInstrs.clear();
613     InstrToCycle.clear();
614     RegToStageDiff.clear();
615     FirstCycle = 0;
616     LastCycle = 0;
617     InitiationInterval = 0;
618   }
619 
620   /// Set the initiation interval for this schedule.
621   void setInitiationInterval(int ii) { InitiationInterval = ii; }
622 
623   /// Return the first cycle in the completed schedule.  This
624   /// can be a negative value.
625   int getFirstCycle() const { return FirstCycle; }
626 
627   /// Return the last cycle in the finalized schedule.
628   int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
629 
630   /// Return the cycle of the earliest scheduled instruction in the dependence
631   /// chain.
632   int earliestCycleInChain(const SDep &Dep);
633 
634   /// Return the cycle of the latest scheduled instruction in the dependence
635   /// chain.
636   int latestCycleInChain(const SDep &Dep);
637 
638   void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
639                     int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
640   bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
641 
642   /// Iterators for the cycle to instruction map.
643   using sched_iterator = DenseMap<int, std::deque<SUnit *>>::iterator;
644   using const_sched_iterator =
645       DenseMap<int, std::deque<SUnit *>>::const_iterator;
646 
647   /// Return true if the instruction is scheduled at the specified stage.
648   bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
649     return (stageScheduled(SU) == (int)StageNum);
650   }
651 
652   /// Return the stage for a scheduled instruction.  Return -1 if
653   /// the instruction has not been scheduled.
654   int stageScheduled(SUnit *SU) const {
655     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
656     if (it == InstrToCycle.end())
657       return -1;
658     return (it->second - FirstCycle) / InitiationInterval;
659   }
660 
661   /// Return the cycle for a scheduled instruction. This function normalizes
662   /// the first cycle to be 0.
663   unsigned cycleScheduled(SUnit *SU) const {
664     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SU);
665     assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
666     return (it->second - FirstCycle) % InitiationInterval;
667   }
668 
669   /// Return the maximum stage count needed for this schedule.
670   unsigned getMaxStageCount() {
671     return (LastCycle - FirstCycle) / InitiationInterval;
672   }
673 
674   /// Return the max. number of stages/iterations that can occur between a
675   /// register definition and its uses.
676   unsigned getStagesForReg(int Reg, unsigned CurStage) {
677     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
678     if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
679       return 1;
680     return Stages.first;
681   }
682 
683   /// The number of stages for a Phi is a little different than other
684   /// instructions. The minimum value computed in RegToStageDiff is 1
685   /// because we assume the Phi is needed for at least 1 iteration.
686   /// This is not the case if the loop value is scheduled prior to the
687   /// Phi in the same stage.  This function returns the number of stages
688   /// or iterations needed between the Phi definition and any uses.
689   unsigned getStagesForPhi(int Reg) {
690     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
691     if (Stages.second)
692       return Stages.first;
693     return Stages.first - 1;
694   }
695 
696   /// Return the instructions that are scheduled at the specified cycle.
697   std::deque<SUnit *> &getInstructions(int cycle) {
698     return ScheduledInstrs[cycle];
699   }
700 
701   bool isValidSchedule(SwingSchedulerDAG *SSD);
702   void finalizeSchedule(SwingSchedulerDAG *SSD);
703   bool orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
704                        std::deque<SUnit *> &Insts);
705   bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
706   bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Inst,
707                              MachineOperand &MO);
708   void print(raw_ostream &os) const;
709   void dump() const;
710 };
711 
712 } // end anonymous namespace
713 
714 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
715 char MachinePipeliner::ID = 0;
716 #ifndef NDEBUG
717 int MachinePipeliner::NumTries = 0;
718 #endif
719 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
720 
721 INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
722                       "Modulo Software Pipelining", false, false)
723 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
724 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
725 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
726 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
727 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
728                     "Modulo Software Pipelining", false, false)
729 
730 /// The "main" function for implementing Swing Modulo Scheduling.
731 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
732   if (skipFunction(mf.getFunction()))
733     return false;
734 
735   if (!EnableSWP)
736     return false;
737 
738   if (mf.getFunction().getAttributes().hasAttribute(
739           AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
740       !EnableSWPOptSize.getPosition())
741     return false;
742 
743   MF = &mf;
744   MLI = &getAnalysis<MachineLoopInfo>();
745   MDT = &getAnalysis<MachineDominatorTree>();
746   TII = MF->getSubtarget().getInstrInfo();
747   RegClassInfo.runOnMachineFunction(*MF);
748 
749   for (auto &L : *MLI)
750     scheduleLoop(*L);
751 
752   return false;
753 }
754 
755 /// Attempt to perform the SMS algorithm on the specified loop. This function is
756 /// the main entry point for the algorithm.  The function identifies candidate
757 /// loops, calculates the minimum initiation interval, and attempts to schedule
758 /// the loop.
759 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
760   bool Changed = false;
761   for (auto &InnerLoop : L)
762     Changed |= scheduleLoop(*InnerLoop);
763 
764 #ifndef NDEBUG
765   // Stop trying after reaching the limit (if any).
766   int Limit = SwpLoopLimit;
767   if (Limit >= 0) {
768     if (NumTries >= SwpLoopLimit)
769       return Changed;
770     NumTries++;
771   }
772 #endif
773 
774   if (!canPipelineLoop(L))
775     return Changed;
776 
777   ++NumTrytoPipeline;
778 
779   Changed = swingModuloScheduler(L);
780 
781   return Changed;
782 }
783 
784 /// Return true if the loop can be software pipelined.  The algorithm is
785 /// restricted to loops with a single basic block.  Make sure that the
786 /// branch in the loop can be analyzed.
787 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
788   if (L.getNumBlocks() != 1)
789     return false;
790 
791   // Check if the branch can't be understood because we can't do pipelining
792   // if that's the case.
793   LI.TBB = nullptr;
794   LI.FBB = nullptr;
795   LI.BrCond.clear();
796   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
797     return false;
798 
799   LI.LoopInductionVar = nullptr;
800   LI.LoopCompare = nullptr;
801   if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
802     return false;
803 
804   if (!L.getLoopPreheader())
805     return false;
806 
807   // If any of the Phis contain subregs, then we can't pipeline
808   // because we don't know how to maintain subreg information in the
809   // VMap structure.
810   MachineBasicBlock *MBB = L.getHeader();
811   for (auto &PHI : MBB->phis())
812     for (unsigned i = 1; i != PHI.getNumOperands(); i += 2)
813       if (PHI.getOperand(i).getSubReg() != 0)
814         return false;
815 
816   return true;
817 }
818 
819 /// The SMS algorithm consists of the following main steps:
820 /// 1. Computation and analysis of the dependence graph.
821 /// 2. Ordering of the nodes (instructions).
822 /// 3. Attempt to Schedule the loop.
823 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
824   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
825 
826   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
827 
828   MachineBasicBlock *MBB = L.getHeader();
829   // The kernel should not include any terminator instructions.  These
830   // will be added back later.
831   SMS.startBlock(MBB);
832 
833   // Compute the number of 'real' instructions in the basic block by
834   // ignoring terminators.
835   unsigned size = MBB->size();
836   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
837                                    E = MBB->instr_end();
838        I != E; ++I, --size)
839     ;
840 
841   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
842   SMS.schedule();
843   SMS.exitRegion();
844 
845   SMS.finishBlock();
846   return SMS.hasNewSchedule();
847 }
848 
849 /// We override the schedule function in ScheduleDAGInstrs to implement the
850 /// scheduling part of the Swing Modulo Scheduling algorithm.
851 void SwingSchedulerDAG::schedule() {
852   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
853   buildSchedGraph(AA);
854   addLoopCarriedDependences(AA);
855   updatePhiDependences();
856   Topo.InitDAGTopologicalSorting();
857   postprocessDAG();
858   changeDependences();
859   DEBUG({
860     for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
861       SUnits[su].dumpAll(this);
862   });
863 
864   NodeSetType NodeSets;
865   findCircuits(NodeSets);
866 
867   // Calculate the MII.
868   unsigned ResMII = calculateResMII();
869   unsigned RecMII = calculateRecMII(NodeSets);
870 
871   fuseRecs(NodeSets);
872 
873   // This flag is used for testing and can cause correctness problems.
874   if (SwpIgnoreRecMII)
875     RecMII = 0;
876 
877   MII = std::max(ResMII, RecMII);
878   DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII << ", res=" << ResMII
879                << ")\n");
880 
881   // Can't schedule a loop without a valid MII.
882   if (MII == 0)
883     return;
884 
885   // Don't pipeline large loops.
886   if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
887     return;
888 
889   computeNodeFunctions(NodeSets);
890 
891   registerPressureFilter(NodeSets);
892 
893   colocateNodeSets(NodeSets);
894 
895   checkNodeSets(NodeSets);
896 
897   DEBUG({
898     for (auto &I : NodeSets) {
899       dbgs() << "  Rec NodeSet ";
900       I.dump();
901     }
902   });
903 
904   std::sort(NodeSets.begin(), NodeSets.end(), std::greater<NodeSet>());
905 
906   groupRemainingNodes(NodeSets);
907 
908   removeDuplicateNodes(NodeSets);
909 
910   DEBUG({
911     for (auto &I : NodeSets) {
912       dbgs() << "  NodeSet ";
913       I.dump();
914     }
915   });
916 
917   computeNodeOrder(NodeSets);
918 
919   SMSchedule Schedule(Pass.MF);
920   Scheduled = schedulePipeline(Schedule);
921 
922   if (!Scheduled)
923     return;
924 
925   unsigned numStages = Schedule.getMaxStageCount();
926   // No need to generate pipeline if there are no overlapped iterations.
927   if (numStages == 0)
928     return;
929 
930   // Check that the maximum stage count is less than user-defined limit.
931   if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
932     return;
933 
934   generatePipelinedLoop(Schedule);
935   ++NumPipelined;
936 }
937 
938 /// Clean up after the software pipeliner runs.
939 void SwingSchedulerDAG::finishBlock() {
940   for (MachineInstr *I : NewMIs)
941     MF.DeleteMachineInstr(I);
942   NewMIs.clear();
943 
944   // Call the superclass.
945   ScheduleDAGInstrs::finishBlock();
946 }
947 
948 /// Return the register values for  the operands of a Phi instruction.
949 /// This function assume the instruction is a Phi.
950 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
951                        unsigned &InitVal, unsigned &LoopVal) {
952   assert(Phi.isPHI() && "Expecting a Phi.");
953 
954   InitVal = 0;
955   LoopVal = 0;
956   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
957     if (Phi.getOperand(i + 1).getMBB() != Loop)
958       InitVal = Phi.getOperand(i).getReg();
959     else
960       LoopVal = Phi.getOperand(i).getReg();
961 
962   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
963 }
964 
965 /// Return the Phi register value that comes from the incoming block.
966 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
967   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
968     if (Phi.getOperand(i + 1).getMBB() != LoopBB)
969       return Phi.getOperand(i).getReg();
970   return 0;
971 }
972 
973 /// Return the Phi register value that comes the loop block.
974 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
975   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
976     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
977       return Phi.getOperand(i).getReg();
978   return 0;
979 }
980 
981 /// Return true if SUb can be reached from SUa following the chain edges.
982 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
983   SmallPtrSet<SUnit *, 8> Visited;
984   SmallVector<SUnit *, 8> Worklist;
985   Worklist.push_back(SUa);
986   while (!Worklist.empty()) {
987     const SUnit *SU = Worklist.pop_back_val();
988     for (auto &SI : SU->Succs) {
989       SUnit *SuccSU = SI.getSUnit();
990       if (SI.getKind() == SDep::Order) {
991         if (Visited.count(SuccSU))
992           continue;
993         if (SuccSU == SUb)
994           return true;
995         Worklist.push_back(SuccSU);
996         Visited.insert(SuccSU);
997       }
998     }
999   }
1000   return false;
1001 }
1002 
1003 /// Return true if the instruction causes a chain between memory
1004 /// references before and after it.
1005 static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
1006   return MI.isCall() || MI.hasUnmodeledSideEffects() ||
1007          (MI.hasOrderedMemoryRef() &&
1008           (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
1009 }
1010 
1011 /// Return the underlying objects for the memory references of an instruction.
1012 /// This function calls the code in ValueTracking, but first checks that the
1013 /// instruction has a memory operand.
1014 static void getUnderlyingObjects(MachineInstr *MI,
1015                                  SmallVectorImpl<Value *> &Objs,
1016                                  const DataLayout &DL) {
1017   if (!MI->hasOneMemOperand())
1018     return;
1019   MachineMemOperand *MM = *MI->memoperands_begin();
1020   if (!MM->getValue())
1021     return;
1022   GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
1023 }
1024 
1025 /// Add a chain edge between a load and store if the store can be an
1026 /// alias of the load on a subsequent iteration, i.e., a loop carried
1027 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
1028 /// but that code doesn't create loop carried dependences.
1029 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
1030   MapVector<Value *, SmallVector<SUnit *, 4>> PendingLoads;
1031   for (auto &SU : SUnits) {
1032     MachineInstr &MI = *SU.getInstr();
1033     if (isDependenceBarrier(MI, AA))
1034       PendingLoads.clear();
1035     else if (MI.mayLoad()) {
1036       SmallVector<Value *, 4> Objs;
1037       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1038       for (auto V : Objs) {
1039         SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
1040         SUs.push_back(&SU);
1041       }
1042     } else if (MI.mayStore()) {
1043       SmallVector<Value *, 4> Objs;
1044       getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1045       for (auto V : Objs) {
1046         MapVector<Value *, SmallVector<SUnit *, 4>>::iterator I =
1047             PendingLoads.find(V);
1048         if (I == PendingLoads.end())
1049           continue;
1050         for (auto Load : I->second) {
1051           if (isSuccOrder(Load, &SU))
1052             continue;
1053           MachineInstr &LdMI = *Load->getInstr();
1054           // First, perform the cheaper check that compares the base register.
1055           // If they are the same and the load offset is less than the store
1056           // offset, then mark the dependence as loop carried potentially.
1057           unsigned BaseReg1, BaseReg2;
1058           int64_t Offset1, Offset2;
1059           if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
1060               !TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
1061             SU.addPred(SDep(Load, SDep::Barrier));
1062             continue;
1063           }
1064           if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
1065             assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
1066                    "What happened to the chain edge?");
1067             SU.addPred(SDep(Load, SDep::Barrier));
1068             continue;
1069           }
1070           // Second, the more expensive check that uses alias analysis on the
1071           // base registers. If they alias, and the load offset is less than
1072           // the store offset, the mark the dependence as loop carried.
1073           if (!AA) {
1074             SU.addPred(SDep(Load, SDep::Barrier));
1075             continue;
1076           }
1077           MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
1078           MachineMemOperand *MMO2 = *MI.memoperands_begin();
1079           if (!MMO1->getValue() || !MMO2->getValue()) {
1080             SU.addPred(SDep(Load, SDep::Barrier));
1081             continue;
1082           }
1083           if (MMO1->getValue() == MMO2->getValue() &&
1084               MMO1->getOffset() <= MMO2->getOffset()) {
1085             SU.addPred(SDep(Load, SDep::Barrier));
1086             continue;
1087           }
1088           AliasResult AAResult = AA->alias(
1089               MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
1090                              MMO1->getAAInfo()),
1091               MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
1092                              MMO2->getAAInfo()));
1093 
1094           if (AAResult != NoAlias)
1095             SU.addPred(SDep(Load, SDep::Barrier));
1096         }
1097       }
1098     }
1099   }
1100 }
1101 
1102 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1103 /// processes dependences for PHIs. This function adds true dependences
1104 /// from a PHI to a use, and a loop carried dependence from the use to the
1105 /// PHI. The loop carried dependence is represented as an anti dependence
1106 /// edge. This function also removes chain dependences between unrelated
1107 /// PHIs.
1108 void SwingSchedulerDAG::updatePhiDependences() {
1109   SmallVector<SDep, 4> RemoveDeps;
1110   const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1111 
1112   // Iterate over each DAG node.
1113   for (SUnit &I : SUnits) {
1114     RemoveDeps.clear();
1115     // Set to true if the instruction has an operand defined by a Phi.
1116     unsigned HasPhiUse = 0;
1117     unsigned HasPhiDef = 0;
1118     MachineInstr *MI = I.getInstr();
1119     // Iterate over each operand, and we process the definitions.
1120     for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
1121                                     MOE = MI->operands_end();
1122          MOI != MOE; ++MOI) {
1123       if (!MOI->isReg())
1124         continue;
1125       unsigned Reg = MOI->getReg();
1126       if (MOI->isDef()) {
1127         // If the register is used by a Phi, then create an anti dependence.
1128         for (MachineRegisterInfo::use_instr_iterator
1129                  UI = MRI.use_instr_begin(Reg),
1130                  UE = MRI.use_instr_end();
1131              UI != UE; ++UI) {
1132           MachineInstr *UseMI = &*UI;
1133           SUnit *SU = getSUnit(UseMI);
1134           if (SU != nullptr && UseMI->isPHI()) {
1135             if (!MI->isPHI()) {
1136               SDep Dep(SU, SDep::Anti, Reg);
1137               I.addPred(Dep);
1138             } else {
1139               HasPhiDef = Reg;
1140               // Add a chain edge to a dependent Phi that isn't an existing
1141               // predecessor.
1142               if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1143                 I.addPred(SDep(SU, SDep::Barrier));
1144             }
1145           }
1146         }
1147       } else if (MOI->isUse()) {
1148         // If the register is defined by a Phi, then create a true dependence.
1149         MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1150         if (DefMI == nullptr)
1151           continue;
1152         SUnit *SU = getSUnit(DefMI);
1153         if (SU != nullptr && DefMI->isPHI()) {
1154           if (!MI->isPHI()) {
1155             SDep Dep(SU, SDep::Data, Reg);
1156             Dep.setLatency(0);
1157             ST.adjustSchedDependency(SU, &I, Dep);
1158             I.addPred(Dep);
1159           } else {
1160             HasPhiUse = Reg;
1161             // Add a chain edge to a dependent Phi that isn't an existing
1162             // predecessor.
1163             if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1164               I.addPred(SDep(SU, SDep::Barrier));
1165           }
1166         }
1167       }
1168     }
1169     // Remove order dependences from an unrelated Phi.
1170     if (!SwpPruneDeps)
1171       continue;
1172     for (auto &PI : I.Preds) {
1173       MachineInstr *PMI = PI.getSUnit()->getInstr();
1174       if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1175         if (I.getInstr()->isPHI()) {
1176           if (PMI->getOperand(0).getReg() == HasPhiUse)
1177             continue;
1178           if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1179             continue;
1180         }
1181         RemoveDeps.push_back(PI);
1182       }
1183     }
1184     for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
1185       I.removePred(RemoveDeps[i]);
1186   }
1187 }
1188 
1189 /// Iterate over each DAG node and see if we can change any dependences
1190 /// in order to reduce the recurrence MII.
1191 void SwingSchedulerDAG::changeDependences() {
1192   // See if an instruction can use a value from the previous iteration.
1193   // If so, we update the base and offset of the instruction and change
1194   // the dependences.
1195   for (SUnit &I : SUnits) {
1196     unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1197     int64_t NewOffset = 0;
1198     if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1199                                NewOffset))
1200       continue;
1201 
1202     // Get the MI and SUnit for the instruction that defines the original base.
1203     unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1204     MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1205     if (!DefMI)
1206       continue;
1207     SUnit *DefSU = getSUnit(DefMI);
1208     if (!DefSU)
1209       continue;
1210     // Get the MI and SUnit for the instruction that defins the new base.
1211     MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1212     if (!LastMI)
1213       continue;
1214     SUnit *LastSU = getSUnit(LastMI);
1215     if (!LastSU)
1216       continue;
1217 
1218     if (Topo.IsReachable(&I, LastSU))
1219       continue;
1220 
1221     // Remove the dependence. The value now depends on a prior iteration.
1222     SmallVector<SDep, 4> Deps;
1223     for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
1224          ++P)
1225       if (P->getSUnit() == DefSU)
1226         Deps.push_back(*P);
1227     for (int i = 0, e = Deps.size(); i != e; i++) {
1228       Topo.RemovePred(&I, Deps[i].getSUnit());
1229       I.removePred(Deps[i]);
1230     }
1231     // Remove the chain dependence between the instructions.
1232     Deps.clear();
1233     for (auto &P : LastSU->Preds)
1234       if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1235         Deps.push_back(P);
1236     for (int i = 0, e = Deps.size(); i != e; i++) {
1237       Topo.RemovePred(LastSU, Deps[i].getSUnit());
1238       LastSU->removePred(Deps[i]);
1239     }
1240 
1241     // Add a dependence between the new instruction and the instruction
1242     // that defines the new base.
1243     SDep Dep(&I, SDep::Anti, NewBase);
1244     LastSU->addPred(Dep);
1245 
1246     // Remember the base and offset information so that we can update the
1247     // instruction during code generation.
1248     InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1249   }
1250 }
1251 
1252 namespace {
1253 
1254 // FuncUnitSorter - Comparison operator used to sort instructions by
1255 // the number of functional unit choices.
1256 struct FuncUnitSorter {
1257   const InstrItineraryData *InstrItins;
1258   DenseMap<unsigned, unsigned> Resources;
1259 
1260   FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
1261 
1262   // Compute the number of functional unit alternatives needed
1263   // at each stage, and take the minimum value. We prioritize the
1264   // instructions by the least number of choices first.
1265   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
1266     unsigned schedClass = Inst->getDesc().getSchedClass();
1267     unsigned min = UINT_MAX;
1268     for (const InstrStage *IS = InstrItins->beginStage(schedClass),
1269                           *IE = InstrItins->endStage(schedClass);
1270          IS != IE; ++IS) {
1271       unsigned funcUnits = IS->getUnits();
1272       unsigned numAlternatives = countPopulation(funcUnits);
1273       if (numAlternatives < min) {
1274         min = numAlternatives;
1275         F = funcUnits;
1276       }
1277     }
1278     return min;
1279   }
1280 
1281   // Compute the critical resources needed by the instruction. This
1282   // function records the functional units needed by instructions that
1283   // must use only one functional unit. We use this as a tie breaker
1284   // for computing the resource MII. The instrutions that require
1285   // the same, highly used, functional unit have high priority.
1286   void calcCriticalResources(MachineInstr &MI) {
1287     unsigned SchedClass = MI.getDesc().getSchedClass();
1288     for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
1289                           *IE = InstrItins->endStage(SchedClass);
1290          IS != IE; ++IS) {
1291       unsigned FuncUnits = IS->getUnits();
1292       if (countPopulation(FuncUnits) == 1)
1293         Resources[FuncUnits]++;
1294     }
1295   }
1296 
1297   /// Return true if IS1 has less priority than IS2.
1298   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1299     unsigned F1 = 0, F2 = 0;
1300     unsigned MFUs1 = minFuncUnits(IS1, F1);
1301     unsigned MFUs2 = minFuncUnits(IS2, F2);
1302     if (MFUs1 == 1 && MFUs2 == 1)
1303       return Resources.lookup(F1) < Resources.lookup(F2);
1304     return MFUs1 > MFUs2;
1305   }
1306 };
1307 
1308 } // end anonymous namespace
1309 
1310 /// Calculate the resource constrained minimum initiation interval for the
1311 /// specified loop. We use the DFA to model the resources needed for
1312 /// each instruction, and we ignore dependences. A different DFA is created
1313 /// for each cycle that is required. When adding a new instruction, we attempt
1314 /// to add it to each existing DFA, until a legal space is found. If the
1315 /// instruction cannot be reserved in an existing DFA, we create a new one.
1316 unsigned SwingSchedulerDAG::calculateResMII() {
1317   SmallVector<DFAPacketizer *, 8> Resources;
1318   MachineBasicBlock *MBB = Loop.getHeader();
1319   Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
1320 
1321   // Sort the instructions by the number of available choices for scheduling,
1322   // least to most. Use the number of critical resources as the tie breaker.
1323   FuncUnitSorter FUS =
1324       FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
1325   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1326                                    E = MBB->getFirstTerminator();
1327        I != E; ++I)
1328     FUS.calcCriticalResources(*I);
1329   PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
1330       FuncUnitOrder(FUS);
1331 
1332   for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1333                                    E = MBB->getFirstTerminator();
1334        I != E; ++I)
1335     FuncUnitOrder.push(&*I);
1336 
1337   while (!FuncUnitOrder.empty()) {
1338     MachineInstr *MI = FuncUnitOrder.top();
1339     FuncUnitOrder.pop();
1340     if (TII->isZeroCost(MI->getOpcode()))
1341       continue;
1342     // Attempt to reserve the instruction in an existing DFA. At least one
1343     // DFA is needed for each cycle.
1344     unsigned NumCycles = getSUnit(MI)->Latency;
1345     unsigned ReservedCycles = 0;
1346     SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
1347     SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
1348     for (unsigned C = 0; C < NumCycles; ++C)
1349       while (RI != RE) {
1350         if ((*RI++)->canReserveResources(*MI)) {
1351           ++ReservedCycles;
1352           break;
1353         }
1354       }
1355     // Start reserving resources using existing DFAs.
1356     for (unsigned C = 0; C < ReservedCycles; ++C) {
1357       --RI;
1358       (*RI)->reserveResources(*MI);
1359     }
1360     // Add new DFAs, if needed, to reserve resources.
1361     for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1362       DFAPacketizer *NewResource =
1363           TII->CreateTargetScheduleState(MF.getSubtarget());
1364       assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1365       NewResource->reserveResources(*MI);
1366       Resources.push_back(NewResource);
1367     }
1368   }
1369   int Resmii = Resources.size();
1370   // Delete the memory for each of the DFAs that were created earlier.
1371   for (DFAPacketizer *RI : Resources) {
1372     DFAPacketizer *D = RI;
1373     delete D;
1374   }
1375   Resources.clear();
1376   return Resmii;
1377 }
1378 
1379 /// Calculate the recurrence-constrainted minimum initiation interval.
1380 /// Iterate over each circuit.  Compute the delay(c) and distance(c)
1381 /// for each circuit. The II needs to satisfy the inequality
1382 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1383 /// II that satistifies the inequality, and the RecMII is the maximum
1384 /// of those values.
1385 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1386   unsigned RecMII = 0;
1387 
1388   for (NodeSet &Nodes : NodeSets) {
1389     if (Nodes.empty())
1390       continue;
1391 
1392     unsigned Delay = Nodes.size() - 1;
1393     unsigned Distance = 1;
1394 
1395     // ii = ceil(delay / distance)
1396     unsigned CurMII = (Delay + Distance - 1) / Distance;
1397     Nodes.setRecMII(CurMII);
1398     if (CurMII > RecMII)
1399       RecMII = CurMII;
1400   }
1401 
1402   return RecMII;
1403 }
1404 
1405 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1406 /// but we do this to find the circuits, and then change them back.
1407 static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1408   SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1409   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1410     SUnit *SU = &SUnits[i];
1411     for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1412          IP != EP; ++IP) {
1413       if (IP->getKind() != SDep::Anti)
1414         continue;
1415       DepsAdded.push_back(std::make_pair(SU, *IP));
1416     }
1417   }
1418   for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1419                                                           E = DepsAdded.end();
1420        I != E; ++I) {
1421     // Remove this anti dependency and add one in the reverse direction.
1422     SUnit *SU = I->first;
1423     SDep &D = I->second;
1424     SUnit *TargetSU = D.getSUnit();
1425     unsigned Reg = D.getReg();
1426     unsigned Lat = D.getLatency();
1427     SU->removePred(D);
1428     SDep Dep(SU, SDep::Anti, Reg);
1429     Dep.setLatency(Lat);
1430     TargetSU->addPred(Dep);
1431   }
1432 }
1433 
1434 /// Create the adjacency structure of the nodes in the graph.
1435 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1436     SwingSchedulerDAG *DAG) {
1437   BitVector Added(SUnits.size());
1438   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1439     Added.reset();
1440     // Add any successor to the adjacency matrix and exclude duplicates.
1441     for (auto &SI : SUnits[i].Succs) {
1442       // Do not process a boundary node and a back-edge is processed only
1443       // if it goes to a Phi.
1444       if (SI.getSUnit()->isBoundaryNode() ||
1445           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1446         continue;
1447       int N = SI.getSUnit()->NodeNum;
1448       if (!Added.test(N)) {
1449         AdjK[i].push_back(N);
1450         Added.set(N);
1451       }
1452     }
1453     // A chain edge between a store and a load is treated as a back-edge in the
1454     // adjacency matrix.
1455     for (auto &PI : SUnits[i].Preds) {
1456       if (!SUnits[i].getInstr()->mayStore() ||
1457           !DAG->isLoopCarriedOrder(&SUnits[i], PI, false))
1458         continue;
1459       if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1460         int N = PI.getSUnit()->NodeNum;
1461         if (!Added.test(N)) {
1462           AdjK[i].push_back(N);
1463           Added.set(N);
1464         }
1465       }
1466     }
1467   }
1468 }
1469 
1470 /// Identify an elementary circuit in the dependence graph starting at the
1471 /// specified node.
1472 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1473                                           bool HasBackedge) {
1474   SUnit *SV = &SUnits[V];
1475   bool F = false;
1476   Stack.insert(SV);
1477   Blocked.set(V);
1478 
1479   for (auto W : AdjK[V]) {
1480     if (NumPaths > MaxPaths)
1481       break;
1482     if (W < S)
1483       continue;
1484     if (W == S) {
1485       if (!HasBackedge)
1486         NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1487       F = true;
1488       ++NumPaths;
1489       break;
1490     } else if (!Blocked.test(W)) {
1491       if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
1492         F = true;
1493     }
1494   }
1495 
1496   if (F)
1497     unblock(V);
1498   else {
1499     for (auto W : AdjK[V]) {
1500       if (W < S)
1501         continue;
1502       if (B[W].count(SV) == 0)
1503         B[W].insert(SV);
1504     }
1505   }
1506   Stack.pop_back();
1507   return F;
1508 }
1509 
1510 /// Unblock a node in the circuit finding algorithm.
1511 void SwingSchedulerDAG::Circuits::unblock(int U) {
1512   Blocked.reset(U);
1513   SmallPtrSet<SUnit *, 4> &BU = B[U];
1514   while (!BU.empty()) {
1515     SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1516     assert(SI != BU.end() && "Invalid B set.");
1517     SUnit *W = *SI;
1518     BU.erase(W);
1519     if (Blocked.test(W->NodeNum))
1520       unblock(W->NodeNum);
1521   }
1522 }
1523 
1524 /// Identify all the elementary circuits in the dependence graph using
1525 /// Johnson's circuit algorithm.
1526 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1527   // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1528   // but we do this to find the circuits, and then change them back.
1529   swapAntiDependences(SUnits);
1530 
1531   Circuits Cir(SUnits);
1532   // Create the adjacency structure.
1533   Cir.createAdjacencyStructure(this);
1534   for (int i = 0, e = SUnits.size(); i != e; ++i) {
1535     Cir.reset();
1536     Cir.circuit(i, i, NodeSets);
1537   }
1538 
1539   // Change the dependences back so that we've created a DAG again.
1540   swapAntiDependences(SUnits);
1541 }
1542 
1543 /// Return true for DAG nodes that we ignore when computing the cost functions.
1544 /// We ignore the back-edge recurrence in order to avoid unbounded recurison
1545 /// in the calculation of the ASAP, ALAP, etc functions.
1546 static bool ignoreDependence(const SDep &D, bool isPred) {
1547   if (D.isArtificial())
1548     return true;
1549   return D.getKind() == SDep::Anti && isPred;
1550 }
1551 
1552 /// Compute several functions need to order the nodes for scheduling.
1553 ///  ASAP - Earliest time to schedule a node.
1554 ///  ALAP - Latest time to schedule a node.
1555 ///  MOV - Mobility function, difference between ALAP and ASAP.
1556 ///  D - Depth of each node.
1557 ///  H - Height of each node.
1558 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1559   ScheduleInfo.resize(SUnits.size());
1560 
1561   DEBUG({
1562     for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1563                                                     E = Topo.end();
1564          I != E; ++I) {
1565       SUnit *SU = &SUnits[*I];
1566       SU->dump(this);
1567     }
1568   });
1569 
1570   int maxASAP = 0;
1571   // Compute ASAP.
1572   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1573                                                   E = Topo.end();
1574        I != E; ++I) {
1575     int asap = 0;
1576     SUnit *SU = &SUnits[*I];
1577     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1578                                     EP = SU->Preds.end();
1579          IP != EP; ++IP) {
1580       if (ignoreDependence(*IP, true))
1581         continue;
1582       SUnit *pred = IP->getSUnit();
1583       asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
1584                                   getDistance(pred, SU, *IP) * MII));
1585     }
1586     maxASAP = std::max(maxASAP, asap);
1587     ScheduleInfo[*I].ASAP = asap;
1588   }
1589 
1590   // Compute ALAP and MOV.
1591   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1592                                                           E = Topo.rend();
1593        I != E; ++I) {
1594     int alap = maxASAP;
1595     SUnit *SU = &SUnits[*I];
1596     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1597                                     ES = SU->Succs.end();
1598          IS != ES; ++IS) {
1599       if (ignoreDependence(*IS, true))
1600         continue;
1601       SUnit *succ = IS->getSUnit();
1602       alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
1603                                   getDistance(SU, succ, *IS) * MII));
1604     }
1605 
1606     ScheduleInfo[*I].ALAP = alap;
1607   }
1608 
1609   // After computing the node functions, compute the summary for each node set.
1610   for (NodeSet &I : NodeSets)
1611     I.computeNodeSetInfo(this);
1612 
1613   DEBUG({
1614     for (unsigned i = 0; i < SUnits.size(); i++) {
1615       dbgs() << "\tNode " << i << ":\n";
1616       dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1617       dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1618       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1619       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1620       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1621     }
1622   });
1623 }
1624 
1625 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1626 /// as the predecessors of the elements of NodeOrder that are not also in
1627 /// NodeOrder.
1628 static bool pred_L(SetVector<SUnit *> &NodeOrder,
1629                    SmallSetVector<SUnit *, 8> &Preds,
1630                    const NodeSet *S = nullptr) {
1631   Preds.clear();
1632   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1633        I != E; ++I) {
1634     for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1635          PI != PE; ++PI) {
1636       if (S && S->count(PI->getSUnit()) == 0)
1637         continue;
1638       if (ignoreDependence(*PI, true))
1639         continue;
1640       if (NodeOrder.count(PI->getSUnit()) == 0)
1641         Preds.insert(PI->getSUnit());
1642     }
1643     // Back-edges are predecessors with an anti-dependence.
1644     for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1645                                     ES = (*I)->Succs.end();
1646          IS != ES; ++IS) {
1647       if (IS->getKind() != SDep::Anti)
1648         continue;
1649       if (S && S->count(IS->getSUnit()) == 0)
1650         continue;
1651       if (NodeOrder.count(IS->getSUnit()) == 0)
1652         Preds.insert(IS->getSUnit());
1653     }
1654   }
1655   return !Preds.empty();
1656 }
1657 
1658 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1659 /// as the successors of the elements of NodeOrder that are not also in
1660 /// NodeOrder.
1661 static bool succ_L(SetVector<SUnit *> &NodeOrder,
1662                    SmallSetVector<SUnit *, 8> &Succs,
1663                    const NodeSet *S = nullptr) {
1664   Succs.clear();
1665   for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1666        I != E; ++I) {
1667     for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1668          SI != SE; ++SI) {
1669       if (S && S->count(SI->getSUnit()) == 0)
1670         continue;
1671       if (ignoreDependence(*SI, false))
1672         continue;
1673       if (NodeOrder.count(SI->getSUnit()) == 0)
1674         Succs.insert(SI->getSUnit());
1675     }
1676     for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1677                                     PE = (*I)->Preds.end();
1678          PI != PE; ++PI) {
1679       if (PI->getKind() != SDep::Anti)
1680         continue;
1681       if (S && S->count(PI->getSUnit()) == 0)
1682         continue;
1683       if (NodeOrder.count(PI->getSUnit()) == 0)
1684         Succs.insert(PI->getSUnit());
1685     }
1686   }
1687   return !Succs.empty();
1688 }
1689 
1690 /// Return true if there is a path from the specified node to any of the nodes
1691 /// in DestNodes. Keep track and return the nodes in any path.
1692 static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1693                         SetVector<SUnit *> &DestNodes,
1694                         SetVector<SUnit *> &Exclude,
1695                         SmallPtrSet<SUnit *, 8> &Visited) {
1696   if (Cur->isBoundaryNode())
1697     return false;
1698   if (Exclude.count(Cur) != 0)
1699     return false;
1700   if (DestNodes.count(Cur) != 0)
1701     return true;
1702   if (!Visited.insert(Cur).second)
1703     return Path.count(Cur) != 0;
1704   bool FoundPath = false;
1705   for (auto &SI : Cur->Succs)
1706     FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1707   for (auto &PI : Cur->Preds)
1708     if (PI.getKind() == SDep::Anti)
1709       FoundPath |=
1710           computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1711   if (FoundPath)
1712     Path.insert(Cur);
1713   return FoundPath;
1714 }
1715 
1716 /// Return true if Set1 is a subset of Set2.
1717 template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1718   for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1719     if (Set2.count(*I) == 0)
1720       return false;
1721   return true;
1722 }
1723 
1724 /// Compute the live-out registers for the instructions in a node-set.
1725 /// The live-out registers are those that are defined in the node-set,
1726 /// but not used. Except for use operands of Phis.
1727 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1728                             NodeSet &NS) {
1729   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1730   MachineRegisterInfo &MRI = MF.getRegInfo();
1731   SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1732   SmallSet<unsigned, 4> Uses;
1733   for (SUnit *SU : NS) {
1734     const MachineInstr *MI = SU->getInstr();
1735     if (MI->isPHI())
1736       continue;
1737     for (const MachineOperand &MO : MI->operands())
1738       if (MO.isReg() && MO.isUse()) {
1739         unsigned Reg = MO.getReg();
1740         if (TargetRegisterInfo::isVirtualRegister(Reg))
1741           Uses.insert(Reg);
1742         else if (MRI.isAllocatable(Reg))
1743           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1744             Uses.insert(*Units);
1745       }
1746   }
1747   for (SUnit *SU : NS)
1748     for (const MachineOperand &MO : SU->getInstr()->operands())
1749       if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1750         unsigned Reg = MO.getReg();
1751         if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1752           if (!Uses.count(Reg))
1753             LiveOutRegs.push_back(RegisterMaskPair(Reg,
1754                                                    LaneBitmask::getNone()));
1755         } else if (MRI.isAllocatable(Reg)) {
1756           for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1757             if (!Uses.count(*Units))
1758               LiveOutRegs.push_back(RegisterMaskPair(*Units,
1759                                                      LaneBitmask::getNone()));
1760         }
1761       }
1762   RPTracker.addLiveRegs(LiveOutRegs);
1763 }
1764 
1765 /// A heuristic to filter nodes in recurrent node-sets if the register
1766 /// pressure of a set is too high.
1767 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1768   for (auto &NS : NodeSets) {
1769     // Skip small node-sets since they won't cause register pressure problems.
1770     if (NS.size() <= 2)
1771       continue;
1772     IntervalPressure RecRegPressure;
1773     RegPressureTracker RecRPTracker(RecRegPressure);
1774     RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1775     computeLiveOuts(MF, RecRPTracker, NS);
1776     RecRPTracker.closeBottom();
1777 
1778     std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1779     std::sort(SUnits.begin(), SUnits.end(), [](const SUnit *A, const SUnit *B) {
1780       return A->NodeNum > B->NodeNum;
1781     });
1782 
1783     for (auto &SU : SUnits) {
1784       // Since we're computing the register pressure for a subset of the
1785       // instructions in a block, we need to set the tracker for each
1786       // instruction in the node-set. The tracker is set to the instruction
1787       // just after the one we're interested in.
1788       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1789       RecRPTracker.setPos(std::next(CurInstI));
1790 
1791       RegPressureDelta RPDelta;
1792       ArrayRef<PressureChange> CriticalPSets;
1793       RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1794                                              CriticalPSets,
1795                                              RecRegPressure.MaxSetPressure);
1796       if (RPDelta.Excess.isValid()) {
1797         DEBUG(dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1798                      << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1799                      << ":" << RPDelta.Excess.getUnitInc());
1800         NS.setExceedPressure(SU);
1801         break;
1802       }
1803       RecRPTracker.recede();
1804     }
1805   }
1806 }
1807 
1808 /// A heuristic to colocate node sets that have the same set of
1809 /// successors.
1810 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1811   unsigned Colocate = 0;
1812   for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1813     NodeSet &N1 = NodeSets[i];
1814     SmallSetVector<SUnit *, 8> S1;
1815     if (N1.empty() || !succ_L(N1, S1))
1816       continue;
1817     for (int j = i + 1; j < e; ++j) {
1818       NodeSet &N2 = NodeSets[j];
1819       if (N1.compareRecMII(N2) != 0)
1820         continue;
1821       SmallSetVector<SUnit *, 8> S2;
1822       if (N2.empty() || !succ_L(N2, S2))
1823         continue;
1824       if (isSubset(S1, S2) && S1.size() == S2.size()) {
1825         N1.setColocate(++Colocate);
1826         N2.setColocate(Colocate);
1827         break;
1828       }
1829     }
1830   }
1831 }
1832 
1833 /// Check if the existing node-sets are profitable. If not, then ignore the
1834 /// recurrent node-sets, and attempt to schedule all nodes together. This is
1835 /// a heuristic. If the MII is large and there is a non-recurrent node with
1836 /// a large depth compared to the MII, then it's best to try and schedule
1837 /// all instruction together instead of starting with the recurrent node-sets.
1838 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1839   // Look for loops with a large MII.
1840   if (MII <= 20)
1841     return;
1842   // Check if the node-set contains only a simple add recurrence.
1843   for (auto &NS : NodeSets)
1844     if (NS.size() > 2)
1845       return;
1846   // If the depth of any instruction is significantly larger than the MII, then
1847   // ignore the recurrent node-sets and treat all instructions equally.
1848   for (auto &SU : SUnits)
1849     if (SU.getDepth() > MII * 1.5) {
1850       NodeSets.clear();
1851       DEBUG(dbgs() << "Clear recurrence node-sets\n");
1852       return;
1853     }
1854 }
1855 
1856 /// Add the nodes that do not belong to a recurrence set into groups
1857 /// based upon connected componenets.
1858 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1859   SetVector<SUnit *> NodesAdded;
1860   SmallPtrSet<SUnit *, 8> Visited;
1861   // Add the nodes that are on a path between the previous node sets and
1862   // the current node set.
1863   for (NodeSet &I : NodeSets) {
1864     SmallSetVector<SUnit *, 8> N;
1865     // Add the nodes from the current node set to the previous node set.
1866     if (succ_L(I, N)) {
1867       SetVector<SUnit *> Path;
1868       for (SUnit *NI : N) {
1869         Visited.clear();
1870         computePath(NI, Path, NodesAdded, I, Visited);
1871       }
1872       if (!Path.empty())
1873         I.insert(Path.begin(), Path.end());
1874     }
1875     // Add the nodes from the previous node set to the current node set.
1876     N.clear();
1877     if (succ_L(NodesAdded, N)) {
1878       SetVector<SUnit *> Path;
1879       for (SUnit *NI : N) {
1880         Visited.clear();
1881         computePath(NI, Path, I, NodesAdded, Visited);
1882       }
1883       if (!Path.empty())
1884         I.insert(Path.begin(), Path.end());
1885     }
1886     NodesAdded.insert(I.begin(), I.end());
1887   }
1888 
1889   // Create a new node set with the connected nodes of any successor of a node
1890   // in a recurrent set.
1891   NodeSet NewSet;
1892   SmallSetVector<SUnit *, 8> N;
1893   if (succ_L(NodesAdded, N))
1894     for (SUnit *I : N)
1895       addConnectedNodes(I, NewSet, NodesAdded);
1896   if (!NewSet.empty())
1897     NodeSets.push_back(NewSet);
1898 
1899   // Create a new node set with the connected nodes of any predecessor of a node
1900   // in a recurrent set.
1901   NewSet.clear();
1902   if (pred_L(NodesAdded, N))
1903     for (SUnit *I : N)
1904       addConnectedNodes(I, NewSet, NodesAdded);
1905   if (!NewSet.empty())
1906     NodeSets.push_back(NewSet);
1907 
1908   // Create new nodes sets with the connected nodes any any remaining node that
1909   // has no predecessor.
1910   for (unsigned i = 0; i < SUnits.size(); ++i) {
1911     SUnit *SU = &SUnits[i];
1912     if (NodesAdded.count(SU) == 0) {
1913       NewSet.clear();
1914       addConnectedNodes(SU, NewSet, NodesAdded);
1915       if (!NewSet.empty())
1916         NodeSets.push_back(NewSet);
1917     }
1918   }
1919 }
1920 
1921 /// Add the node to the set, and add all is its connected nodes to the set.
1922 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1923                                           SetVector<SUnit *> &NodesAdded) {
1924   NewSet.insert(SU);
1925   NodesAdded.insert(SU);
1926   for (auto &SI : SU->Succs) {
1927     SUnit *Successor = SI.getSUnit();
1928     if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1929       addConnectedNodes(Successor, NewSet, NodesAdded);
1930   }
1931   for (auto &PI : SU->Preds) {
1932     SUnit *Predecessor = PI.getSUnit();
1933     if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1934       addConnectedNodes(Predecessor, NewSet, NodesAdded);
1935   }
1936 }
1937 
1938 /// Return true if Set1 contains elements in Set2. The elements in common
1939 /// are returned in a different container.
1940 static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1941                         SmallSetVector<SUnit *, 8> &Result) {
1942   Result.clear();
1943   for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1944     SUnit *SU = Set1[i];
1945     if (Set2.count(SU) != 0)
1946       Result.insert(SU);
1947   }
1948   return !Result.empty();
1949 }
1950 
1951 /// Merge the recurrence node sets that have the same initial node.
1952 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1953   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1954        ++I) {
1955     NodeSet &NI = *I;
1956     for (NodeSetType::iterator J = I + 1; J != E;) {
1957       NodeSet &NJ = *J;
1958       if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1959         if (NJ.compareRecMII(NI) > 0)
1960           NI.setRecMII(NJ.getRecMII());
1961         for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1962              ++NII)
1963           I->insert(*NII);
1964         NodeSets.erase(J);
1965         E = NodeSets.end();
1966       } else {
1967         ++J;
1968       }
1969     }
1970   }
1971 }
1972 
1973 /// Remove nodes that have been scheduled in previous NodeSets.
1974 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1975   for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1976        ++I)
1977     for (NodeSetType::iterator J = I + 1; J != E;) {
1978       J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1979 
1980       if (J->empty()) {
1981         NodeSets.erase(J);
1982         E = NodeSets.end();
1983       } else {
1984         ++J;
1985       }
1986     }
1987 }
1988 
1989 /// Return true if Inst1 defines a value that is used in Inst2.
1990 static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) {
1991   for (auto &SI : Inst1->Succs)
1992     if (SI.getSUnit() == Inst2 && SI.getKind() == SDep::Data)
1993       return true;
1994   return false;
1995 }
1996 
1997 /// Compute an ordered list of the dependence graph nodes, which
1998 /// indicates the order that the nodes will be scheduled.  This is a
1999 /// two-level algorithm. First, a partial order is created, which
2000 /// consists of a list of sets ordered from highest to lowest priority.
2001 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2002   SmallSetVector<SUnit *, 8> R;
2003   NodeOrder.clear();
2004 
2005   for (auto &Nodes : NodeSets) {
2006     DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2007     OrderKind Order;
2008     SmallSetVector<SUnit *, 8> N;
2009     if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
2010       R.insert(N.begin(), N.end());
2011       Order = BottomUp;
2012       DEBUG(dbgs() << "  Bottom up (preds) ");
2013     } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
2014       R.insert(N.begin(), N.end());
2015       Order = TopDown;
2016       DEBUG(dbgs() << "  Top down (succs) ");
2017     } else if (isIntersect(N, Nodes, R)) {
2018       // If some of the successors are in the existing node-set, then use the
2019       // top-down ordering.
2020       Order = TopDown;
2021       DEBUG(dbgs() << "  Top down (intersect) ");
2022     } else if (NodeSets.size() == 1) {
2023       for (auto &N : Nodes)
2024         if (N->Succs.size() == 0)
2025           R.insert(N);
2026       Order = BottomUp;
2027       DEBUG(dbgs() << "  Bottom up (all) ");
2028     } else {
2029       // Find the node with the highest ASAP.
2030       SUnit *maxASAP = nullptr;
2031       for (SUnit *SU : Nodes) {
2032         if (maxASAP == nullptr || getASAP(SU) >= getASAP(maxASAP))
2033           maxASAP = SU;
2034       }
2035       R.insert(maxASAP);
2036       Order = BottomUp;
2037       DEBUG(dbgs() << "  Bottom up (default) ");
2038     }
2039 
2040     while (!R.empty()) {
2041       if (Order == TopDown) {
2042         // Choose the node with the maximum height.  If more than one, choose
2043         // the node with the lowest MOV. If still more than one, check if there
2044         // is a dependence between the instructions.
2045         while (!R.empty()) {
2046           SUnit *maxHeight = nullptr;
2047           for (SUnit *I : R) {
2048             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2049               maxHeight = I;
2050             else if (getHeight(I) == getHeight(maxHeight) &&
2051                      getMOV(I) < getMOV(maxHeight) &&
2052                      !hasDataDependence(maxHeight, I))
2053               maxHeight = I;
2054             else if (hasDataDependence(I, maxHeight))
2055               maxHeight = I;
2056           }
2057           NodeOrder.insert(maxHeight);
2058           DEBUG(dbgs() << maxHeight->NodeNum << " ");
2059           R.remove(maxHeight);
2060           for (const auto &I : maxHeight->Succs) {
2061             if (Nodes.count(I.getSUnit()) == 0)
2062               continue;
2063             if (NodeOrder.count(I.getSUnit()) != 0)
2064               continue;
2065             if (ignoreDependence(I, false))
2066               continue;
2067             R.insert(I.getSUnit());
2068           }
2069           // Back-edges are predecessors with an anti-dependence.
2070           for (const auto &I : maxHeight->Preds) {
2071             if (I.getKind() != SDep::Anti)
2072               continue;
2073             if (Nodes.count(I.getSUnit()) == 0)
2074               continue;
2075             if (NodeOrder.count(I.getSUnit()) != 0)
2076               continue;
2077             R.insert(I.getSUnit());
2078           }
2079         }
2080         Order = BottomUp;
2081         DEBUG(dbgs() << "\n   Switching order to bottom up ");
2082         SmallSetVector<SUnit *, 8> N;
2083         if (pred_L(NodeOrder, N, &Nodes))
2084           R.insert(N.begin(), N.end());
2085       } else {
2086         // Choose the node with the maximum depth.  If more than one, choose
2087         // the node with the lowest MOV. If there is still more than one, check
2088         // for a dependence between the instructions.
2089         while (!R.empty()) {
2090           SUnit *maxDepth = nullptr;
2091           for (SUnit *I : R) {
2092             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2093               maxDepth = I;
2094             else if (getDepth(I) == getDepth(maxDepth) &&
2095                      getMOV(I) < getMOV(maxDepth) &&
2096                      !hasDataDependence(I, maxDepth))
2097               maxDepth = I;
2098             else if (hasDataDependence(maxDepth, I))
2099               maxDepth = I;
2100           }
2101           NodeOrder.insert(maxDepth);
2102           DEBUG(dbgs() << maxDepth->NodeNum << " ");
2103           R.remove(maxDepth);
2104           if (Nodes.isExceedSU(maxDepth)) {
2105             Order = TopDown;
2106             R.clear();
2107             R.insert(Nodes.getNode(0));
2108             break;
2109           }
2110           for (const auto &I : maxDepth->Preds) {
2111             if (Nodes.count(I.getSUnit()) == 0)
2112               continue;
2113             if (NodeOrder.count(I.getSUnit()) != 0)
2114               continue;
2115             if (I.getKind() == SDep::Anti)
2116               continue;
2117             R.insert(I.getSUnit());
2118           }
2119           // Back-edges are predecessors with an anti-dependence.
2120           for (const auto &I : maxDepth->Succs) {
2121             if (I.getKind() != SDep::Anti)
2122               continue;
2123             if (Nodes.count(I.getSUnit()) == 0)
2124               continue;
2125             if (NodeOrder.count(I.getSUnit()) != 0)
2126               continue;
2127             R.insert(I.getSUnit());
2128           }
2129         }
2130         Order = TopDown;
2131         DEBUG(dbgs() << "\n   Switching order to top down ");
2132         SmallSetVector<SUnit *, 8> N;
2133         if (succ_L(NodeOrder, N, &Nodes))
2134           R.insert(N.begin(), N.end());
2135       }
2136     }
2137     DEBUG(dbgs() << "\nDone with Nodeset\n");
2138   }
2139 
2140   DEBUG({
2141     dbgs() << "Node order: ";
2142     for (SUnit *I : NodeOrder)
2143       dbgs() << " " << I->NodeNum << " ";
2144     dbgs() << "\n";
2145   });
2146 }
2147 
2148 /// Process the nodes in the computed order and create the pipelined schedule
2149 /// of the instructions, if possible. Return true if a schedule is found.
2150 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2151   if (NodeOrder.empty())
2152     return false;
2153 
2154   bool scheduleFound = false;
2155   // Keep increasing II until a valid schedule is found.
2156   for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
2157     Schedule.reset();
2158     Schedule.setInitiationInterval(II);
2159     DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2160 
2161     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2162     SetVector<SUnit *>::iterator NE = NodeOrder.end();
2163     do {
2164       SUnit *SU = *NI;
2165 
2166       // Compute the schedule time for the instruction, which is based
2167       // upon the scheduled time for any predecessors/successors.
2168       int EarlyStart = INT_MIN;
2169       int LateStart = INT_MAX;
2170       // These values are set when the size of the schedule window is limited
2171       // due to chain dependences.
2172       int SchedEnd = INT_MAX;
2173       int SchedStart = INT_MIN;
2174       Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2175                             II, this);
2176       DEBUG({
2177         dbgs() << "Inst (" << SU->NodeNum << ") ";
2178         SU->getInstr()->dump();
2179         dbgs() << "\n";
2180       });
2181       DEBUG({
2182         dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
2183                << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
2184       });
2185 
2186       if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2187           SchedStart > LateStart)
2188         scheduleFound = false;
2189       else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2190         SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2191         scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2192       } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2193         SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2194         scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2195       } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2196         SchedEnd =
2197             std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2198         // When scheduling a Phi it is better to start at the late cycle and go
2199         // backwards. The default order may insert the Phi too far away from
2200         // its first dependence.
2201         if (SU->getInstr()->isPHI())
2202           scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2203         else
2204           scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2205       } else {
2206         int FirstCycle = Schedule.getFirstCycle();
2207         scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2208                                         FirstCycle + getASAP(SU) + II - 1, II);
2209       }
2210       // Even if we find a schedule, make sure the schedule doesn't exceed the
2211       // allowable number of stages. We keep trying if this happens.
2212       if (scheduleFound)
2213         if (SwpMaxStages > -1 &&
2214             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2215           scheduleFound = false;
2216 
2217       DEBUG({
2218         if (!scheduleFound)
2219           dbgs() << "\tCan't schedule\n";
2220       });
2221     } while (++NI != NE && scheduleFound);
2222 
2223     // If a schedule is found, check if it is a valid schedule too.
2224     if (scheduleFound)
2225       scheduleFound = Schedule.isValidSchedule(this);
2226   }
2227 
2228   DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
2229 
2230   if (scheduleFound)
2231     Schedule.finalizeSchedule(this);
2232   else
2233     Schedule.reset();
2234 
2235   return scheduleFound && Schedule.getMaxStageCount() > 0;
2236 }
2237 
2238 /// Given a schedule for the loop, generate a new version of the loop,
2239 /// and replace the old version.  This function generates a prolog
2240 /// that contains the initial iterations in the pipeline, and kernel
2241 /// loop, and the epilogue that contains the code for the final
2242 /// iterations.
2243 void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2244   // Create a new basic block for the kernel and add it to the CFG.
2245   MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2246 
2247   unsigned MaxStageCount = Schedule.getMaxStageCount();
2248 
2249   // Remember the registers that are used in different stages. The index is
2250   // the iteration, or stage, that the instruction is scheduled in.  This is
2251   // a map between register names in the orignal block and the names created
2252   // in each stage of the pipelined loop.
2253   ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2254   InstrMapTy InstrMap;
2255 
2256   SmallVector<MachineBasicBlock *, 4> PrologBBs;
2257   // Generate the prolog instructions that set up the pipeline.
2258   generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2259   MF.insert(BB->getIterator(), KernelBB);
2260 
2261   // Rearrange the instructions to generate the new, pipelined loop,
2262   // and update register names as needed.
2263   for (int Cycle = Schedule.getFirstCycle(),
2264            LastCycle = Schedule.getFinalCycle();
2265        Cycle <= LastCycle; ++Cycle) {
2266     std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
2267     // This inner loop schedules each instruction in the cycle.
2268     for (SUnit *CI : CycleInstrs) {
2269       if (CI->getInstr()->isPHI())
2270         continue;
2271       unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2272       MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2273       updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2274       KernelBB->push_back(NewMI);
2275       InstrMap[NewMI] = CI->getInstr();
2276     }
2277   }
2278 
2279   // Copy any terminator instructions to the new kernel, and update
2280   // names as needed.
2281   for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2282                                    E = BB->instr_end();
2283        I != E; ++I) {
2284     MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2285     updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2286     KernelBB->push_back(NewMI);
2287     InstrMap[NewMI] = &*I;
2288   }
2289 
2290   KernelBB->transferSuccessors(BB);
2291   KernelBB->replaceSuccessor(BB, KernelBB);
2292 
2293   generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2294                        VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2295   generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2296                InstrMap, MaxStageCount, MaxStageCount, false);
2297 
2298   DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2299 
2300   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
2301   // Generate the epilog instructions to complete the pipeline.
2302   generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2303                  PrologBBs);
2304 
2305   // We need this step because the register allocation doesn't handle some
2306   // situations well, so we insert copies to help out.
2307   splitLifetimes(KernelBB, EpilogBBs, Schedule);
2308 
2309   // Remove dead instructions due to loop induction variables.
2310   removeDeadInstructions(KernelBB, EpilogBBs);
2311 
2312   // Add branches between prolog and epilog blocks.
2313   addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2314 
2315   // Remove the original loop since it's no longer referenced.
2316   BB->clear();
2317   BB->eraseFromParent();
2318 
2319   delete[] VRMap;
2320 }
2321 
2322 /// Generate the pipeline prolog code.
2323 void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2324                                        MachineBasicBlock *KernelBB,
2325                                        ValueMapTy *VRMap,
2326                                        MBBVectorTy &PrologBBs) {
2327   MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2328   assert(PreheaderBB != nullptr &&
2329          "Need to add code to handle loops w/o preheader");
2330   MachineBasicBlock *PredBB = PreheaderBB;
2331   InstrMapTy InstrMap;
2332 
2333   // Generate a basic block for each stage, not including the last stage,
2334   // which will be generated in the kernel. Each basic block may contain
2335   // instructions from multiple stages/iterations.
2336   for (unsigned i = 0; i < LastStage; ++i) {
2337     // Create and insert the prolog basic block prior to the original loop
2338     // basic block.  The original loop is removed later.
2339     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2340     PrologBBs.push_back(NewBB);
2341     MF.insert(BB->getIterator(), NewBB);
2342     NewBB->transferSuccessors(PredBB);
2343     PredBB->addSuccessor(NewBB);
2344     PredBB = NewBB;
2345 
2346     // Generate instructions for each appropriate stage. Process instructions
2347     // in original program order.
2348     for (int StageNum = i; StageNum >= 0; --StageNum) {
2349       for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2350                                        BBE = BB->getFirstTerminator();
2351            BBI != BBE; ++BBI) {
2352         if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
2353           if (BBI->isPHI())
2354             continue;
2355           MachineInstr *NewMI =
2356               cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2357           updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2358                             VRMap);
2359           NewBB->push_back(NewMI);
2360           InstrMap[NewMI] = &*BBI;
2361         }
2362       }
2363     }
2364     rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2365     DEBUG({
2366       dbgs() << "prolog:\n";
2367       NewBB->dump();
2368     });
2369   }
2370 
2371   PredBB->replaceSuccessor(BB, KernelBB);
2372 
2373   // Check if we need to remove the branch from the preheader to the original
2374   // loop, and replace it with a branch to the new loop.
2375   unsigned numBranches = TII->removeBranch(*PreheaderBB);
2376   if (numBranches) {
2377     SmallVector<MachineOperand, 0> Cond;
2378     TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
2379   }
2380 }
2381 
2382 /// Generate the pipeline epilog code. The epilog code finishes the iterations
2383 /// that were started in either the prolog or the kernel.  We create a basic
2384 /// block for each stage that needs to complete.
2385 void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2386                                        MachineBasicBlock *KernelBB,
2387                                        ValueMapTy *VRMap,
2388                                        MBBVectorTy &EpilogBBs,
2389                                        MBBVectorTy &PrologBBs) {
2390   // We need to change the branch from the kernel to the first epilog block, so
2391   // this call to analyze branch uses the kernel rather than the original BB.
2392   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2393   SmallVector<MachineOperand, 4> Cond;
2394   bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2395   assert(!checkBranch && "generateEpilog must be able to analyze the branch");
2396   if (checkBranch)
2397     return;
2398 
2399   MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2400   if (*LoopExitI == KernelBB)
2401     ++LoopExitI;
2402   assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2403   MachineBasicBlock *LoopExitBB = *LoopExitI;
2404 
2405   MachineBasicBlock *PredBB = KernelBB;
2406   MachineBasicBlock *EpilogStart = LoopExitBB;
2407   InstrMapTy InstrMap;
2408 
2409   // Generate a basic block for each stage, not including the last stage,
2410   // which was generated for the kernel.  Each basic block may contain
2411   // instructions from multiple stages/iterations.
2412   int EpilogStage = LastStage + 1;
2413   for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2414     MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2415     EpilogBBs.push_back(NewBB);
2416     MF.insert(BB->getIterator(), NewBB);
2417 
2418     PredBB->replaceSuccessor(LoopExitBB, NewBB);
2419     NewBB->addSuccessor(LoopExitBB);
2420 
2421     if (EpilogStart == LoopExitBB)
2422       EpilogStart = NewBB;
2423 
2424     // Add instructions to the epilog depending on the current block.
2425     // Process instructions in original program order.
2426     for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2427       for (auto &BBI : *BB) {
2428         if (BBI.isPHI())
2429           continue;
2430         MachineInstr *In = &BBI;
2431         if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2432           MachineInstr *NewMI = cloneInstr(In, EpilogStage - LastStage, 0);
2433           updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2434           NewBB->push_back(NewMI);
2435           InstrMap[NewMI] = In;
2436         }
2437       }
2438     }
2439     generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2440                          VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2441     generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2442                  InstrMap, LastStage, EpilogStage, i == 1);
2443     PredBB = NewBB;
2444 
2445     DEBUG({
2446       dbgs() << "epilog:\n";
2447       NewBB->dump();
2448     });
2449   }
2450 
2451   // Fix any Phi nodes in the loop exit block.
2452   for (MachineInstr &MI : *LoopExitBB) {
2453     if (!MI.isPHI())
2454       break;
2455     for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2456       MachineOperand &MO = MI.getOperand(i);
2457       if (MO.getMBB() == BB)
2458         MO.setMBB(PredBB);
2459     }
2460   }
2461 
2462   // Create a branch to the new epilog from the kernel.
2463   // Remove the original branch and add a new branch to the epilog.
2464   TII->removeBranch(*KernelBB);
2465   TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2466   // Add a branch to the loop exit.
2467   if (EpilogBBs.size() > 0) {
2468     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2469     SmallVector<MachineOperand, 4> Cond1;
2470     TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
2471   }
2472 }
2473 
2474 /// Replace all uses of FromReg that appear outside the specified
2475 /// basic block with ToReg.
2476 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2477                                     MachineBasicBlock *MBB,
2478                                     MachineRegisterInfo &MRI,
2479                                     LiveIntervals &LIS) {
2480   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2481                                          E = MRI.use_end();
2482        I != E;) {
2483     MachineOperand &O = *I;
2484     ++I;
2485     if (O.getParent()->getParent() != MBB)
2486       O.setReg(ToReg);
2487   }
2488   if (!LIS.hasInterval(ToReg))
2489     LIS.createEmptyInterval(ToReg);
2490 }
2491 
2492 /// Return true if the register has a use that occurs outside the
2493 /// specified loop.
2494 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2495                             MachineRegisterInfo &MRI) {
2496   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2497                                          E = MRI.use_end();
2498        I != E; ++I)
2499     if (I->getParent()->getParent() != BB)
2500       return true;
2501   return false;
2502 }
2503 
2504 /// Generate Phis for the specific block in the generated pipelined code.
2505 /// This function looks at the Phis from the original code to guide the
2506 /// creation of new Phis.
2507 void SwingSchedulerDAG::generateExistingPhis(
2508     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2509     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2510     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2511     bool IsLast) {
2512   // Compute the stage number for the initial value of the Phi, which
2513   // comes from the prolog. The prolog to use depends on to which kernel/
2514   // epilog that we're adding the Phi.
2515   unsigned PrologStage = 0;
2516   unsigned PrevStage = 0;
2517   bool InKernel = (LastStageNum == CurStageNum);
2518   if (InKernel) {
2519     PrologStage = LastStageNum - 1;
2520     PrevStage = CurStageNum;
2521   } else {
2522     PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2523     PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2524   }
2525 
2526   for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2527                                    BBE = BB->getFirstNonPHI();
2528        BBI != BBE; ++BBI) {
2529     unsigned Def = BBI->getOperand(0).getReg();
2530 
2531     unsigned InitVal = 0;
2532     unsigned LoopVal = 0;
2533     getPhiRegs(*BBI, BB, InitVal, LoopVal);
2534 
2535     unsigned PhiOp1 = 0;
2536     // The Phi value from the loop body typically is defined in the loop, but
2537     // not always. So, we need to check if the value is defined in the loop.
2538     unsigned PhiOp2 = LoopVal;
2539     if (VRMap[LastStageNum].count(LoopVal))
2540       PhiOp2 = VRMap[LastStageNum][LoopVal];
2541 
2542     int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2543     int LoopValStage =
2544         Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2545     unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2546     if (NumStages == 0) {
2547       // We don't need to generate a Phi anymore, but we need to rename any uses
2548       // of the Phi value.
2549       unsigned NewReg = VRMap[PrevStage][LoopVal];
2550       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2551                             Def, NewReg);
2552       if (VRMap[CurStageNum].count(LoopVal))
2553         VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2554     }
2555     // Adjust the number of Phis needed depending on the number of prologs left,
2556     // and the distance from where the Phi is first scheduled.
2557     unsigned NumPhis = NumStages;
2558     if (!InKernel && (int)PrologStage < LoopValStage)
2559       // The NumPhis is the maximum number of new Phis needed during the steady
2560       // state. If the Phi has not been scheduled in current prolog, then we
2561       // need to generate less Phis.
2562       NumPhis = std::max((int)NumPhis - (int)(LoopValStage - PrologStage), 1);
2563     // The number of Phis cannot exceed the number of prolog stages. Each
2564     // stage can potentially define two values.
2565     NumPhis = std::min(NumPhis, PrologStage + 2);
2566 
2567     unsigned NewReg = 0;
2568 
2569     unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2570     // In the epilog, we may need to look back one stage to get the correct
2571     // Phi name because the epilog and prolog blocks execute the same stage.
2572     // The correct name is from the previous block only when the Phi has
2573     // been completely scheduled prior to the epilog, and Phi value is not
2574     // needed in multiple stages.
2575     int StageDiff = 0;
2576     if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2577         NumPhis == 1)
2578       StageDiff = 1;
2579     // Adjust the computations below when the phi and the loop definition
2580     // are scheduled in different stages.
2581     if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2582       StageDiff = StageScheduled - LoopValStage;
2583     for (unsigned np = 0; np < NumPhis; ++np) {
2584       // If the Phi hasn't been scheduled, then use the initial Phi operand
2585       // value. Otherwise, use the scheduled version of the instruction. This
2586       // is a little complicated when a Phi references another Phi.
2587       if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2588         PhiOp1 = InitVal;
2589       // Check if the Phi has already been scheduled in a prolog stage.
2590       else if (PrologStage >= AccessStage + StageDiff + np &&
2591                VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2592         PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2593       // Check if the Phi has already been scheduled, but the loop intruction
2594       // is either another Phi, or doesn't occur in the loop.
2595       else if (PrologStage >= AccessStage + StageDiff + np) {
2596         // If the Phi references another Phi, we need to examine the other
2597         // Phi to get the correct value.
2598         PhiOp1 = LoopVal;
2599         MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2600         int Indirects = 1;
2601         while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2602           int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2603           if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2604             PhiOp1 = getInitPhiReg(*InstOp1, BB);
2605           else
2606             PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2607           InstOp1 = MRI.getVRegDef(PhiOp1);
2608           int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2609           int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2610           if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2611               VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2612             PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2613             break;
2614           }
2615           ++Indirects;
2616         }
2617       } else
2618         PhiOp1 = InitVal;
2619       // If this references a generated Phi in the kernel, get the Phi operand
2620       // from the incoming block.
2621       if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2622         if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2623           PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2624 
2625       MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2626       bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2627       // In the epilog, a map lookup is needed to get the value from the kernel,
2628       // or previous epilog block. How is does this depends on if the
2629       // instruction is scheduled in the previous block.
2630       if (!InKernel) {
2631         int StageDiffAdj = 0;
2632         if (LoopValStage != -1 && StageScheduled > LoopValStage)
2633           StageDiffAdj = StageScheduled - LoopValStage;
2634         // Use the loop value defined in the kernel, unless the kernel
2635         // contains the last definition of the Phi.
2636         if (np == 0 && PrevStage == LastStageNum &&
2637             (StageScheduled != 0 || LoopValStage != 0) &&
2638             VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2639           PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2640         // Use the value defined by the Phi. We add one because we switch
2641         // from looking at the loop value to the Phi definition.
2642         else if (np > 0 && PrevStage == LastStageNum &&
2643                  VRMap[PrevStage - np + 1].count(Def))
2644           PhiOp2 = VRMap[PrevStage - np + 1][Def];
2645         // Use the loop value defined in the kernel.
2646         else if ((unsigned)LoopValStage + StageDiffAdj > PrologStage + 1 &&
2647                  VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2648           PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2649         // Use the value defined by the Phi, unless we're generating the first
2650         // epilog and the Phi refers to a Phi in a different stage.
2651         else if (VRMap[PrevStage - np].count(Def) &&
2652                  (!LoopDefIsPhi || PrevStage != LastStageNum))
2653           PhiOp2 = VRMap[PrevStage - np][Def];
2654       }
2655 
2656       // Check if we can reuse an existing Phi. This occurs when a Phi
2657       // references another Phi, and the other Phi is scheduled in an
2658       // earlier stage. We can try to reuse an existing Phi up until the last
2659       // stage of the current Phi.
2660       if (LoopDefIsPhi && (int)PrologStage >= StageScheduled) {
2661         int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2662         int StageDiff = (StageScheduled - LoopValStage);
2663         LVNumStages -= StageDiff;
2664         if (LVNumStages > (int)np) {
2665           NewReg = PhiOp2;
2666           unsigned ReuseStage = CurStageNum;
2667           if (Schedule.isLoopCarried(this, *PhiInst))
2668             ReuseStage -= LVNumStages;
2669           // Check if the Phi to reuse has been generated yet. If not, then
2670           // there is nothing to reuse.
2671           if (VRMap[ReuseStage].count(LoopVal)) {
2672             NewReg = VRMap[ReuseStage][LoopVal];
2673 
2674             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2675                                   &*BBI, Def, NewReg);
2676             // Update the map with the new Phi name.
2677             VRMap[CurStageNum - np][Def] = NewReg;
2678             PhiOp2 = NewReg;
2679             if (VRMap[LastStageNum - np - 1].count(LoopVal))
2680               PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2681 
2682             if (IsLast && np == NumPhis - 1)
2683               replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2684             continue;
2685           }
2686         } else if (InKernel && StageDiff > 0 &&
2687                    VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2688           PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2689       }
2690 
2691       const TargetRegisterClass *RC = MRI.getRegClass(Def);
2692       NewReg = MRI.createVirtualRegister(RC);
2693 
2694       MachineInstrBuilder NewPhi =
2695           BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2696                   TII->get(TargetOpcode::PHI), NewReg);
2697       NewPhi.addReg(PhiOp1).addMBB(BB1);
2698       NewPhi.addReg(PhiOp2).addMBB(BB2);
2699       if (np == 0)
2700         InstrMap[NewPhi] = &*BBI;
2701 
2702       // We define the Phis after creating the new pipelined code, so
2703       // we need to rename the Phi values in scheduled instructions.
2704 
2705       unsigned PrevReg = 0;
2706       if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2707         PrevReg = VRMap[PrevStage - np][LoopVal];
2708       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2709                             Def, NewReg, PrevReg);
2710       // If the Phi has been scheduled, use the new name for rewriting.
2711       if (VRMap[CurStageNum - np].count(Def)) {
2712         unsigned R = VRMap[CurStageNum - np][Def];
2713         rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2714                               R, NewReg);
2715       }
2716 
2717       // Check if we need to rename any uses that occurs after the loop. The
2718       // register to replace depends on whether the Phi is scheduled in the
2719       // epilog.
2720       if (IsLast && np == NumPhis - 1)
2721         replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2722 
2723       // In the kernel, a dependent Phi uses the value from this Phi.
2724       if (InKernel)
2725         PhiOp2 = NewReg;
2726 
2727       // Update the map with the new Phi name.
2728       VRMap[CurStageNum - np][Def] = NewReg;
2729     }
2730 
2731     while (NumPhis++ < NumStages) {
2732       rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2733                             &*BBI, Def, NewReg, 0);
2734     }
2735 
2736     // Check if we need to rename a Phi that has been eliminated due to
2737     // scheduling.
2738     if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2739       replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2740   }
2741 }
2742 
2743 /// Generate Phis for the specified block in the generated pipelined code.
2744 /// These are new Phis needed because the definition is scheduled after the
2745 /// use in the pipelened sequence.
2746 void SwingSchedulerDAG::generatePhis(
2747     MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2748     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2749     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2750     bool IsLast) {
2751   // Compute the stage number that contains the initial Phi value, and
2752   // the Phi from the previous stage.
2753   unsigned PrologStage = 0;
2754   unsigned PrevStage = 0;
2755   unsigned StageDiff = CurStageNum - LastStageNum;
2756   bool InKernel = (StageDiff == 0);
2757   if (InKernel) {
2758     PrologStage = LastStageNum - 1;
2759     PrevStage = CurStageNum;
2760   } else {
2761     PrologStage = LastStageNum - StageDiff;
2762     PrevStage = LastStageNum + StageDiff - 1;
2763   }
2764 
2765   for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2766                                    BBE = BB->instr_end();
2767        BBI != BBE; ++BBI) {
2768     for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2769       MachineOperand &MO = BBI->getOperand(i);
2770       if (!MO.isReg() || !MO.isDef() ||
2771           !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2772         continue;
2773 
2774       int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2775       assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2776       unsigned Def = MO.getReg();
2777       unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2778       // An instruction scheduled in stage 0 and is used after the loop
2779       // requires a phi in the epilog for the last definition from either
2780       // the kernel or prolog.
2781       if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2782           hasUseAfterLoop(Def, BB, MRI))
2783         NumPhis = 1;
2784       if (!InKernel && (unsigned)StageScheduled > PrologStage)
2785         continue;
2786 
2787       unsigned PhiOp2 = VRMap[PrevStage][Def];
2788       if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2789         if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2790           PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2791       // The number of Phis can't exceed the number of prolog stages. The
2792       // prolog stage number is zero based.
2793       if (NumPhis > PrologStage + 1 - StageScheduled)
2794         NumPhis = PrologStage + 1 - StageScheduled;
2795       for (unsigned np = 0; np < NumPhis; ++np) {
2796         unsigned PhiOp1 = VRMap[PrologStage][Def];
2797         if (np <= PrologStage)
2798           PhiOp1 = VRMap[PrologStage - np][Def];
2799         if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2800           if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2801             PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2802           if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2803             PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2804         }
2805         if (!InKernel)
2806           PhiOp2 = VRMap[PrevStage - np][Def];
2807 
2808         const TargetRegisterClass *RC = MRI.getRegClass(Def);
2809         unsigned NewReg = MRI.createVirtualRegister(RC);
2810 
2811         MachineInstrBuilder NewPhi =
2812             BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2813                     TII->get(TargetOpcode::PHI), NewReg);
2814         NewPhi.addReg(PhiOp1).addMBB(BB1);
2815         NewPhi.addReg(PhiOp2).addMBB(BB2);
2816         if (np == 0)
2817           InstrMap[NewPhi] = &*BBI;
2818 
2819         // Rewrite uses and update the map. The actions depend upon whether
2820         // we generating code for the kernel or epilog blocks.
2821         if (InKernel) {
2822           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2823                                 &*BBI, PhiOp1, NewReg);
2824           rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2825                                 &*BBI, PhiOp2, NewReg);
2826 
2827           PhiOp2 = NewReg;
2828           VRMap[PrevStage - np - 1][Def] = NewReg;
2829         } else {
2830           VRMap[CurStageNum - np][Def] = NewReg;
2831           if (np == NumPhis - 1)
2832             rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2833                                   &*BBI, Def, NewReg);
2834         }
2835         if (IsLast && np == NumPhis - 1)
2836           replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2837       }
2838     }
2839   }
2840 }
2841 
2842 /// Remove instructions that generate values with no uses.
2843 /// Typically, these are induction variable operations that generate values
2844 /// used in the loop itself.  A dead instruction has a definition with
2845 /// no uses, or uses that occur in the original loop only.
2846 void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2847                                                MBBVectorTy &EpilogBBs) {
2848   // For each epilog block, check that the value defined by each instruction
2849   // is used.  If not, delete it.
2850   for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2851                                      MBE = EpilogBBs.rend();
2852        MBB != MBE; ++MBB)
2853     for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2854                                                    ME = (*MBB)->instr_rend();
2855          MI != ME;) {
2856       // From DeadMachineInstructionElem. Don't delete inline assembly.
2857       if (MI->isInlineAsm()) {
2858         ++MI;
2859         continue;
2860       }
2861       bool SawStore = false;
2862       // Check if it's safe to remove the instruction due to side effects.
2863       // We can, and want to, remove Phis here.
2864       if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
2865         ++MI;
2866         continue;
2867       }
2868       bool used = true;
2869       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2870                                       MOE = MI->operands_end();
2871            MOI != MOE; ++MOI) {
2872         if (!MOI->isReg() || !MOI->isDef())
2873           continue;
2874         unsigned reg = MOI->getReg();
2875         unsigned realUses = 0;
2876         for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2877                                                EI = MRI.use_end();
2878              UI != EI; ++UI) {
2879           // Check if there are any uses that occur only in the original
2880           // loop.  If so, that's not a real use.
2881           if (UI->getParent()->getParent() != BB) {
2882             realUses++;
2883             used = true;
2884             break;
2885           }
2886         }
2887         if (realUses > 0)
2888           break;
2889         used = false;
2890       }
2891       if (!used) {
2892         MI++->eraseFromParent();
2893         continue;
2894       }
2895       ++MI;
2896     }
2897   // In the kernel block, check if we can remove a Phi that generates a value
2898   // used in an instruction removed in the epilog block.
2899   for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2900                                    BBE = KernelBB->getFirstNonPHI();
2901        BBI != BBE;) {
2902     MachineInstr *MI = &*BBI;
2903     ++BBI;
2904     unsigned reg = MI->getOperand(0).getReg();
2905     if (MRI.use_begin(reg) == MRI.use_end()) {
2906       MI->eraseFromParent();
2907     }
2908   }
2909 }
2910 
2911 /// For loop carried definitions, we split the lifetime of a virtual register
2912 /// that has uses past the definition in the next iteration. A copy with a new
2913 /// virtual register is inserted before the definition, which helps with
2914 /// generating a better register assignment.
2915 ///
2916 ///   v1 = phi(a, v2)     v1 = phi(a, v2)
2917 ///   v2 = phi(b, v3)     v2 = phi(b, v3)
2918 ///   v3 = ..             v4 = copy v1
2919 ///   .. = V1             v3 = ..
2920 ///                       .. = v4
2921 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
2922                                        MBBVectorTy &EpilogBBs,
2923                                        SMSchedule &Schedule) {
2924   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2925   for (auto &PHI : KernelBB->phis()) {
2926     unsigned Def = PHI.getOperand(0).getReg();
2927     // Check for any Phi definition that used as an operand of another Phi
2928     // in the same block.
2929     for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
2930                                                  E = MRI.use_instr_end();
2931          I != E; ++I) {
2932       if (I->isPHI() && I->getParent() == KernelBB) {
2933         // Get the loop carried definition.
2934         unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
2935         if (!LCDef)
2936           continue;
2937         MachineInstr *MI = MRI.getVRegDef(LCDef);
2938         if (!MI || MI->getParent() != KernelBB || MI->isPHI())
2939           continue;
2940         // Search through the rest of the block looking for uses of the Phi
2941         // definition. If one occurs, then split the lifetime.
2942         unsigned SplitReg = 0;
2943         for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
2944                                     KernelBB->instr_end()))
2945           if (BBJ.readsRegister(Def)) {
2946             // We split the lifetime when we find the first use.
2947             if (SplitReg == 0) {
2948               SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
2949               BuildMI(*KernelBB, MI, MI->getDebugLoc(),
2950                       TII->get(TargetOpcode::COPY), SplitReg)
2951                   .addReg(Def);
2952             }
2953             BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
2954           }
2955         if (!SplitReg)
2956           continue;
2957         // Search through each of the epilog blocks for any uses to be renamed.
2958         for (auto &Epilog : EpilogBBs)
2959           for (auto &I : *Epilog)
2960             if (I.readsRegister(Def))
2961               I.substituteRegister(Def, SplitReg, 0, *TRI);
2962         break;
2963       }
2964     }
2965   }
2966 }
2967 
2968 /// Remove the incoming block from the Phis in a basic block.
2969 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
2970   for (MachineInstr &MI : *BB) {
2971     if (!MI.isPHI())
2972       break;
2973     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
2974       if (MI.getOperand(i + 1).getMBB() == Incoming) {
2975         MI.RemoveOperand(i + 1);
2976         MI.RemoveOperand(i);
2977         break;
2978       }
2979   }
2980 }
2981 
2982 /// Create branches from each prolog basic block to the appropriate epilog
2983 /// block.  These edges are needed if the loop ends before reaching the
2984 /// kernel.
2985 void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2986                                     MachineBasicBlock *KernelBB,
2987                                     MBBVectorTy &EpilogBBs,
2988                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
2989   assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
2990   MachineInstr *IndVar = Pass.LI.LoopInductionVar;
2991   MachineInstr *Cmp = Pass.LI.LoopCompare;
2992   MachineBasicBlock *LastPro = KernelBB;
2993   MachineBasicBlock *LastEpi = KernelBB;
2994 
2995   // Start from the blocks connected to the kernel and work "out"
2996   // to the first prolog and the last epilog blocks.
2997   SmallVector<MachineInstr *, 4> PrevInsts;
2998   unsigned MaxIter = PrologBBs.size() - 1;
2999   unsigned LC = UINT_MAX;
3000   unsigned LCMin = UINT_MAX;
3001   for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
3002     // Add branches to the prolog that go to the corresponding
3003     // epilog, and the fall-thru prolog/kernel block.
3004     MachineBasicBlock *Prolog = PrologBBs[j];
3005     MachineBasicBlock *Epilog = EpilogBBs[i];
3006     // We've executed one iteration, so decrement the loop count and check for
3007     // the loop end.
3008     SmallVector<MachineOperand, 4> Cond;
3009     // Check if the LOOP0 has already been removed. If so, then there is no need
3010     // to reduce the trip count.
3011     if (LC != 0)
3012       LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
3013                                 MaxIter);
3014 
3015     // Record the value of the first trip count, which is used to determine if
3016     // branches and blocks can be removed for constant trip counts.
3017     if (LCMin == UINT_MAX)
3018       LCMin = LC;
3019 
3020     unsigned numAdded = 0;
3021     if (TargetRegisterInfo::isVirtualRegister(LC)) {
3022       Prolog->addSuccessor(Epilog);
3023       numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
3024     } else if (j >= LCMin) {
3025       Prolog->addSuccessor(Epilog);
3026       Prolog->removeSuccessor(LastPro);
3027       LastEpi->removeSuccessor(Epilog);
3028       numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
3029       removePhis(Epilog, LastEpi);
3030       // Remove the blocks that are no longer referenced.
3031       if (LastPro != LastEpi) {
3032         LastEpi->clear();
3033         LastEpi->eraseFromParent();
3034       }
3035       LastPro->clear();
3036       LastPro->eraseFromParent();
3037     } else {
3038       numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
3039       removePhis(Epilog, Prolog);
3040     }
3041     LastPro = Prolog;
3042     LastEpi = Epilog;
3043     for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
3044                                                    E = Prolog->instr_rend();
3045          I != E && numAdded > 0; ++I, --numAdded)
3046       updateInstruction(&*I, false, j, 0, Schedule, VRMap);
3047   }
3048 }
3049 
3050 /// Return true if we can compute the amount the instruction changes
3051 /// during each iteration. Set Delta to the amount of the change.
3052 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
3053   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3054   unsigned BaseReg;
3055   int64_t Offset;
3056   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
3057     return false;
3058 
3059   MachineRegisterInfo &MRI = MF.getRegInfo();
3060   // Check if there is a Phi. If so, get the definition in the loop.
3061   MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
3062   if (BaseDef && BaseDef->isPHI()) {
3063     BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
3064     BaseDef = MRI.getVRegDef(BaseReg);
3065   }
3066   if (!BaseDef)
3067     return false;
3068 
3069   int D = 0;
3070   if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
3071     return false;
3072 
3073   Delta = D;
3074   return true;
3075 }
3076 
3077 /// Update the memory operand with a new offset when the pipeliner
3078 /// generates a new copy of the instruction that refers to a
3079 /// different memory location.
3080 void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
3081                                           MachineInstr &OldMI, unsigned Num) {
3082   if (Num == 0)
3083     return;
3084   // If the instruction has memory operands, then adjust the offset
3085   // when the instruction appears in different stages.
3086   unsigned NumRefs = NewMI.memoperands_end() - NewMI.memoperands_begin();
3087   if (NumRefs == 0)
3088     return;
3089   MachineInstr::mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NumRefs);
3090   unsigned Refs = 0;
3091   for (MachineMemOperand *MMO : NewMI.memoperands()) {
3092     if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
3093         (!MMO->getValue())) {
3094       NewMemRefs[Refs++] = MMO;
3095       continue;
3096     }
3097     unsigned Delta;
3098     if (computeDelta(OldMI, Delta)) {
3099       int64_t AdjOffset = Delta * Num;
3100       NewMemRefs[Refs++] =
3101           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize());
3102     } else
3103       NewMemRefs[Refs++] = MF.getMachineMemOperand(MMO, 0, UINT64_MAX);
3104   }
3105   NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs);
3106 }
3107 
3108 /// Clone the instruction for the new pipelined loop and update the
3109 /// memory operands, if needed.
3110 MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
3111                                             unsigned CurStageNum,
3112                                             unsigned InstStageNum) {
3113   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3114   // Check for tied operands in inline asm instructions. This should be handled
3115   // elsewhere, but I'm not sure of the best solution.
3116   if (OldMI->isInlineAsm())
3117     for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
3118       const auto &MO = OldMI->getOperand(i);
3119       if (MO.isReg() && MO.isUse())
3120         break;
3121       unsigned UseIdx;
3122       if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
3123         NewMI->tieOperands(i, UseIdx);
3124     }
3125   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3126   return NewMI;
3127 }
3128 
3129 /// Clone the instruction for the new pipelined loop. If needed, this
3130 /// function updates the instruction using the values saved in the
3131 /// InstrChanges structure.
3132 MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
3133                                                      unsigned CurStageNum,
3134                                                      unsigned InstStageNum,
3135                                                      SMSchedule &Schedule) {
3136   MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3137   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3138       InstrChanges.find(getSUnit(OldMI));
3139   if (It != InstrChanges.end()) {
3140     std::pair<unsigned, int64_t> RegAndOffset = It->second;
3141     unsigned BasePos, OffsetPos;
3142     if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
3143       return nullptr;
3144     int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
3145     MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
3146     if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
3147       NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
3148     NewMI->getOperand(OffsetPos).setImm(NewOffset);
3149   }
3150   updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3151   return NewMI;
3152 }
3153 
3154 /// Update the machine instruction with new virtual registers.  This
3155 /// function may change the defintions and/or uses.
3156 void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
3157                                           unsigned CurStageNum,
3158                                           unsigned InstrStageNum,
3159                                           SMSchedule &Schedule,
3160                                           ValueMapTy *VRMap) {
3161   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
3162     MachineOperand &MO = NewMI->getOperand(i);
3163     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3164       continue;
3165     unsigned reg = MO.getReg();
3166     if (MO.isDef()) {
3167       // Create a new virtual register for the definition.
3168       const TargetRegisterClass *RC = MRI.getRegClass(reg);
3169       unsigned NewReg = MRI.createVirtualRegister(RC);
3170       MO.setReg(NewReg);
3171       VRMap[CurStageNum][reg] = NewReg;
3172       if (LastDef)
3173         replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
3174     } else if (MO.isUse()) {
3175       MachineInstr *Def = MRI.getVRegDef(reg);
3176       // Compute the stage that contains the last definition for instruction.
3177       int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
3178       unsigned StageNum = CurStageNum;
3179       if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
3180         // Compute the difference in stages between the defintion and the use.
3181         unsigned StageDiff = (InstrStageNum - DefStageNum);
3182         // Make an adjustment to get the last definition.
3183         StageNum -= StageDiff;
3184       }
3185       if (VRMap[StageNum].count(reg))
3186         MO.setReg(VRMap[StageNum][reg]);
3187     }
3188   }
3189 }
3190 
3191 /// Return the instruction in the loop that defines the register.
3192 /// If the definition is a Phi, then follow the Phi operand to
3193 /// the instruction in the loop.
3194 MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
3195   SmallPtrSet<MachineInstr *, 8> Visited;
3196   MachineInstr *Def = MRI.getVRegDef(Reg);
3197   while (Def->isPHI()) {
3198     if (!Visited.insert(Def).second)
3199       break;
3200     for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3201       if (Def->getOperand(i + 1).getMBB() == BB) {
3202         Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3203         break;
3204       }
3205   }
3206   return Def;
3207 }
3208 
3209 /// Return the new name for the value from the previous stage.
3210 unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3211                                           unsigned LoopVal, unsigned LoopStage,
3212                                           ValueMapTy *VRMap,
3213                                           MachineBasicBlock *BB) {
3214   unsigned PrevVal = 0;
3215   if (StageNum > PhiStage) {
3216     MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3217     if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
3218       // The name is defined in the previous stage.
3219       PrevVal = VRMap[StageNum - 1][LoopVal];
3220     else if (VRMap[StageNum].count(LoopVal))
3221       // The previous name is defined in the current stage when the instruction
3222       // order is swapped.
3223       PrevVal = VRMap[StageNum][LoopVal];
3224     else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
3225       // The loop value hasn't yet been scheduled.
3226       PrevVal = LoopVal;
3227     else if (StageNum == PhiStage + 1)
3228       // The loop value is another phi, which has not been scheduled.
3229       PrevVal = getInitPhiReg(*LoopInst, BB);
3230     else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
3231       // The loop value is another phi, which has been scheduled.
3232       PrevVal =
3233           getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3234                         LoopStage, VRMap, BB);
3235   }
3236   return PrevVal;
3237 }
3238 
3239 /// Rewrite the Phi values in the specified block to use the mappings
3240 /// from the initial operand. Once the Phi is scheduled, we switch
3241 /// to using the loop value instead of the Phi value, so those names
3242 /// do not need to be rewritten.
3243 void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3244                                          unsigned StageNum,
3245                                          SMSchedule &Schedule,
3246                                          ValueMapTy *VRMap,
3247                                          InstrMapTy &InstrMap) {
3248   for (auto &PHI : BB->phis()) {
3249     unsigned InitVal = 0;
3250     unsigned LoopVal = 0;
3251     getPhiRegs(PHI, BB, InitVal, LoopVal);
3252     unsigned PhiDef = PHI.getOperand(0).getReg();
3253 
3254     unsigned PhiStage =
3255         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3256     unsigned LoopStage =
3257         (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3258     unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3259     if (NumPhis > StageNum)
3260       NumPhis = StageNum;
3261     for (unsigned np = 0; np <= NumPhis; ++np) {
3262       unsigned NewVal =
3263           getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3264       if (!NewVal)
3265         NewVal = InitVal;
3266       rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
3267                             PhiDef, NewVal);
3268     }
3269   }
3270 }
3271 
3272 /// Rewrite a previously scheduled instruction to use the register value
3273 /// from the new instruction. Make sure the instruction occurs in the
3274 /// basic block, and we don't change the uses in the new instruction.
3275 void SwingSchedulerDAG::rewriteScheduledInstr(
3276     MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3277     unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3278     unsigned NewReg, unsigned PrevReg) {
3279   bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3280   int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3281   // Rewrite uses that have been scheduled already to use the new
3282   // Phi register.
3283   for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3284                                          EI = MRI.use_end();
3285        UI != EI;) {
3286     MachineOperand &UseOp = *UI;
3287     MachineInstr *UseMI = UseOp.getParent();
3288     ++UI;
3289     if (UseMI->getParent() != BB)
3290       continue;
3291     if (UseMI->isPHI()) {
3292       if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3293         continue;
3294       if (getLoopPhiReg(*UseMI, BB) != OldReg)
3295         continue;
3296     }
3297     InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3298     assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3299     SUnit *OrigMISU = getSUnit(OrigInstr->second);
3300     int StageSched = Schedule.stageScheduled(OrigMISU);
3301     int CycleSched = Schedule.cycleScheduled(OrigMISU);
3302     unsigned ReplaceReg = 0;
3303     // This is the stage for the scheduled instruction.
3304     if (StagePhi == StageSched && Phi->isPHI()) {
3305       int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3306       if (PrevReg && InProlog)
3307         ReplaceReg = PrevReg;
3308       else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3309                (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3310         ReplaceReg = PrevReg;
3311       else
3312         ReplaceReg = NewReg;
3313     }
3314     // The scheduled instruction occurs before the scheduled Phi, and the
3315     // Phi is not loop carried.
3316     if (!InProlog && StagePhi + 1 == StageSched &&
3317         !Schedule.isLoopCarried(this, *Phi))
3318       ReplaceReg = NewReg;
3319     if (StagePhi > StageSched && Phi->isPHI())
3320       ReplaceReg = NewReg;
3321     if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3322       ReplaceReg = NewReg;
3323     if (ReplaceReg) {
3324       MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3325       UseOp.setReg(ReplaceReg);
3326     }
3327   }
3328 }
3329 
3330 /// Check if we can change the instruction to use an offset value from the
3331 /// previous iteration. If so, return true and set the base and offset values
3332 /// so that we can rewrite the load, if necessary.
3333 ///   v1 = Phi(v0, v3)
3334 ///   v2 = load v1, 0
3335 ///   v3 = post_store v1, 4, x
3336 /// This function enables the load to be rewritten as v2 = load v3, 4.
3337 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3338                                               unsigned &BasePos,
3339                                               unsigned &OffsetPos,
3340                                               unsigned &NewBase,
3341                                               int64_t &Offset) {
3342   // Get the load instruction.
3343   if (TII->isPostIncrement(*MI))
3344     return false;
3345   unsigned BasePosLd, OffsetPosLd;
3346   if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3347     return false;
3348   unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3349 
3350   // Look for the Phi instruction.
3351   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3352   MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3353   if (!Phi || !Phi->isPHI())
3354     return false;
3355   // Get the register defined in the loop block.
3356   unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3357   if (!PrevReg)
3358     return false;
3359 
3360   // Check for the post-increment load/store instruction.
3361   MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3362   if (!PrevDef || PrevDef == MI)
3363     return false;
3364 
3365   if (!TII->isPostIncrement(*PrevDef))
3366     return false;
3367 
3368   unsigned BasePos1 = 0, OffsetPos1 = 0;
3369   if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3370     return false;
3371 
3372   // Make sure offset values are both positive or both negative.
3373   int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3374   int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3375   if ((LoadOffset >= 0) != (StoreOffset >= 0))
3376     return false;
3377 
3378   // Set the return value once we determine that we return true.
3379   BasePos = BasePosLd;
3380   OffsetPos = OffsetPosLd;
3381   NewBase = PrevReg;
3382   Offset = StoreOffset;
3383   return true;
3384 }
3385 
3386 /// Apply changes to the instruction if needed. The changes are need
3387 /// to improve the scheduling and depend up on the final schedule.
3388 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3389                                          SMSchedule &Schedule) {
3390   SUnit *SU = getSUnit(MI);
3391   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3392       InstrChanges.find(SU);
3393   if (It != InstrChanges.end()) {
3394     std::pair<unsigned, int64_t> RegAndOffset = It->second;
3395     unsigned BasePos, OffsetPos;
3396     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3397       return;
3398     unsigned BaseReg = MI->getOperand(BasePos).getReg();
3399     MachineInstr *LoopDef = findDefInLoop(BaseReg);
3400     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3401     int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3402     int BaseStageNum = Schedule.stageScheduled(SU);
3403     int BaseCycleNum = Schedule.cycleScheduled(SU);
3404     if (BaseStageNum < DefStageNum) {
3405       MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3406       int OffsetDiff = DefStageNum - BaseStageNum;
3407       if (DefCycleNum < BaseCycleNum) {
3408         NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3409         if (OffsetDiff > 0)
3410           --OffsetDiff;
3411       }
3412       int64_t NewOffset =
3413           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3414       NewMI->getOperand(OffsetPos).setImm(NewOffset);
3415       SU->setInstr(NewMI);
3416       MISUnitMap[NewMI] = SU;
3417       NewMIs.insert(NewMI);
3418     }
3419   }
3420 }
3421 
3422 /// Return true for an order dependence that is loop carried potentially.
3423 /// An order dependence is loop carried if the destination defines a value
3424 /// that may be used by the source in a subsequent iteration.
3425 bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
3426                                            bool isSucc) {
3427   if (!isOrder(Source, Dep) || Dep.isArtificial())
3428     return false;
3429 
3430   if (!SwpPruneLoopCarried)
3431     return true;
3432 
3433   MachineInstr *SI = Source->getInstr();
3434   MachineInstr *DI = Dep.getSUnit()->getInstr();
3435   if (!isSucc)
3436     std::swap(SI, DI);
3437   assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3438 
3439   // Assume ordered loads and stores may have a loop carried dependence.
3440   if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3441       SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3442     return true;
3443 
3444   // Only chain dependences between a load and store can be loop carried.
3445   if (!DI->mayStore() || !SI->mayLoad())
3446     return false;
3447 
3448   unsigned DeltaS, DeltaD;
3449   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
3450     return true;
3451 
3452   unsigned BaseRegS, BaseRegD;
3453   int64_t OffsetS, OffsetD;
3454   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3455   if (!TII->getMemOpBaseRegImmOfs(*SI, BaseRegS, OffsetS, TRI) ||
3456       !TII->getMemOpBaseRegImmOfs(*DI, BaseRegD, OffsetD, TRI))
3457     return true;
3458 
3459   if (BaseRegS != BaseRegD)
3460     return true;
3461 
3462   uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
3463   uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
3464 
3465   // This is the main test, which checks the offset values and the loop
3466   // increment value to determine if the accesses may be loop carried.
3467   if (OffsetS >= OffsetD)
3468     return OffsetS + AccessSizeS > DeltaS;
3469   else
3470     return OffsetD + AccessSizeD > DeltaD;
3471 
3472   return true;
3473 }
3474 
3475 void SwingSchedulerDAG::postprocessDAG() {
3476   for (auto &M : Mutations)
3477     M->apply(this);
3478 }
3479 
3480 /// Try to schedule the node at the specified StartCycle and continue
3481 /// until the node is schedule or the EndCycle is reached.  This function
3482 /// returns true if the node is scheduled.  This routine may search either
3483 /// forward or backward for a place to insert the instruction based upon
3484 /// the relative values of StartCycle and EndCycle.
3485 bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3486   bool forward = true;
3487   if (StartCycle > EndCycle)
3488     forward = false;
3489 
3490   // The terminating condition depends on the direction.
3491   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3492   for (int curCycle = StartCycle; curCycle != termCycle;
3493        forward ? ++curCycle : --curCycle) {
3494 
3495     // Add the already scheduled instructions at the specified cycle to the DFA.
3496     Resources->clearResources();
3497     for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
3498          checkCycle <= LastCycle; checkCycle += II) {
3499       std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
3500 
3501       for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
3502                                          E = cycleInstrs.end();
3503            I != E; ++I) {
3504         if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
3505           continue;
3506         assert(Resources->canReserveResources(*(*I)->getInstr()) &&
3507                "These instructions have already been scheduled.");
3508         Resources->reserveResources(*(*I)->getInstr());
3509       }
3510     }
3511     if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3512         Resources->canReserveResources(*SU->getInstr())) {
3513       DEBUG({
3514         dbgs() << "\tinsert at cycle " << curCycle << " ";
3515         SU->getInstr()->dump();
3516       });
3517 
3518       ScheduledInstrs[curCycle].push_back(SU);
3519       InstrToCycle.insert(std::make_pair(SU, curCycle));
3520       if (curCycle > LastCycle)
3521         LastCycle = curCycle;
3522       if (curCycle < FirstCycle)
3523         FirstCycle = curCycle;
3524       return true;
3525     }
3526     DEBUG({
3527       dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3528       SU->getInstr()->dump();
3529     });
3530   }
3531   return false;
3532 }
3533 
3534 // Return the cycle of the earliest scheduled instruction in the chain.
3535 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
3536   SmallPtrSet<SUnit *, 8> Visited;
3537   SmallVector<SDep, 8> Worklist;
3538   Worklist.push_back(Dep);
3539   int EarlyCycle = INT_MAX;
3540   while (!Worklist.empty()) {
3541     const SDep &Cur = Worklist.pop_back_val();
3542     SUnit *PrevSU = Cur.getSUnit();
3543     if (Visited.count(PrevSU))
3544       continue;
3545     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3546     if (it == InstrToCycle.end())
3547       continue;
3548     EarlyCycle = std::min(EarlyCycle, it->second);
3549     for (const auto &PI : PrevSU->Preds)
3550       if (SwingSchedulerDAG::isOrder(PrevSU, PI))
3551         Worklist.push_back(PI);
3552     Visited.insert(PrevSU);
3553   }
3554   return EarlyCycle;
3555 }
3556 
3557 // Return the cycle of the latest scheduled instruction in the chain.
3558 int SMSchedule::latestCycleInChain(const SDep &Dep) {
3559   SmallPtrSet<SUnit *, 8> Visited;
3560   SmallVector<SDep, 8> Worklist;
3561   Worklist.push_back(Dep);
3562   int LateCycle = INT_MIN;
3563   while (!Worklist.empty()) {
3564     const SDep &Cur = Worklist.pop_back_val();
3565     SUnit *SuccSU = Cur.getSUnit();
3566     if (Visited.count(SuccSU))
3567       continue;
3568     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3569     if (it == InstrToCycle.end())
3570       continue;
3571     LateCycle = std::max(LateCycle, it->second);
3572     for (const auto &SI : SuccSU->Succs)
3573       if (SwingSchedulerDAG::isOrder(SuccSU, SI))
3574         Worklist.push_back(SI);
3575     Visited.insert(SuccSU);
3576   }
3577   return LateCycle;
3578 }
3579 
3580 /// If an instruction has a use that spans multiple iterations, then
3581 /// return true. These instructions are characterized by having a back-ege
3582 /// to a Phi, which contains a reference to another Phi.
3583 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3584   for (auto &P : SU->Preds)
3585     if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
3586       for (auto &S : P.getSUnit()->Succs)
3587         if (S.getKind() == SDep::Order && S.getSUnit()->getInstr()->isPHI())
3588           return P.getSUnit();
3589   return nullptr;
3590 }
3591 
3592 /// Compute the scheduling start slot for the instruction.  The start slot
3593 /// depends on any predecessor or successor nodes scheduled already.
3594 void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3595                               int *MinEnd, int *MaxStart, int II,
3596                               SwingSchedulerDAG *DAG) {
3597   // Iterate over each instruction that has been scheduled already.  The start
3598   // slot computuation depends on whether the previously scheduled instruction
3599   // is a predecessor or successor of the specified instruction.
3600   for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3601 
3602     // Iterate over each instruction in the current cycle.
3603     for (SUnit *I : getInstructions(cycle)) {
3604       // Because we're processing a DAG for the dependences, we recognize
3605       // the back-edge in recurrences by anti dependences.
3606       for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
3607         const SDep &Dep = SU->Preds[i];
3608         if (Dep.getSUnit() == I) {
3609           if (!DAG->isBackedge(SU, Dep)) {
3610             int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
3611                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3612             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3613             if (DAG->isLoopCarriedOrder(SU, Dep, false)) {
3614               int End = earliestCycleInChain(Dep) + (II - 1);
3615               *MinEnd = std::min(*MinEnd, End);
3616             }
3617           } else {
3618             int LateStart = cycle - DAG->getLatency(SU, Dep) +
3619                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3620             *MinLateStart = std::min(*MinLateStart, LateStart);
3621           }
3622         }
3623         // For instruction that requires multiple iterations, make sure that
3624         // the dependent instruction is not scheduled past the definition.
3625         SUnit *BE = multipleIterations(I, DAG);
3626         if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3627             !SU->isPred(I))
3628           *MinLateStart = std::min(*MinLateStart, cycle);
3629       }
3630       for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i)
3631         if (SU->Succs[i].getSUnit() == I) {
3632           const SDep &Dep = SU->Succs[i];
3633           if (!DAG->isBackedge(SU, Dep)) {
3634             int LateStart = cycle - DAG->getLatency(SU, Dep) +
3635                             DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
3636             *MinLateStart = std::min(*MinLateStart, LateStart);
3637             if (DAG->isLoopCarriedOrder(SU, Dep)) {
3638               int Start = latestCycleInChain(Dep) + 1 - II;
3639               *MaxStart = std::max(*MaxStart, Start);
3640             }
3641           } else {
3642             int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
3643                              DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
3644             *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3645           }
3646         }
3647     }
3648   }
3649 }
3650 
3651 /// Order the instructions within a cycle so that the definitions occur
3652 /// before the uses. Returns true if the instruction is added to the start
3653 /// of the list, or false if added to the end.
3654 bool SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
3655                                  std::deque<SUnit *> &Insts) {
3656   MachineInstr *MI = SU->getInstr();
3657   bool OrderBeforeUse = false;
3658   bool OrderAfterDef = false;
3659   bool OrderBeforeDef = false;
3660   unsigned MoveDef = 0;
3661   unsigned MoveUse = 0;
3662   int StageInst1 = stageScheduled(SU);
3663 
3664   unsigned Pos = 0;
3665   for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3666        ++I, ++Pos) {
3667     // Relative order of Phis does not matter.
3668     if (MI->isPHI() && (*I)->getInstr()->isPHI())
3669       continue;
3670     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3671       MachineOperand &MO = MI->getOperand(i);
3672       if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3673         continue;
3674       unsigned Reg = MO.getReg();
3675       unsigned BasePos, OffsetPos;
3676       if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3677         if (MI->getOperand(BasePos).getReg() == Reg)
3678           if (unsigned NewReg = SSD->getInstrBaseReg(SU))
3679             Reg = NewReg;
3680       bool Reads, Writes;
3681       std::tie(Reads, Writes) =
3682           (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3683       if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3684         OrderBeforeUse = true;
3685         MoveUse = Pos;
3686       } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3687         // Add the instruction after the scheduled instruction.
3688         OrderAfterDef = true;
3689         MoveDef = Pos;
3690       } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3691         if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3692           OrderBeforeUse = true;
3693           MoveUse = Pos;
3694         } else {
3695           OrderAfterDef = true;
3696           MoveDef = Pos;
3697         }
3698       } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3699         OrderBeforeUse = true;
3700         MoveUse = Pos;
3701         if (MoveUse != 0) {
3702           OrderAfterDef = true;
3703           MoveDef = Pos - 1;
3704         }
3705       } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3706         // Add the instruction before the scheduled instruction.
3707         OrderBeforeUse = true;
3708         MoveUse = Pos;
3709       } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3710                  isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3711         OrderBeforeDef = true;
3712         MoveUse = Pos;
3713       }
3714     }
3715     // Check for order dependences between instructions. Make sure the source
3716     // is ordered before the destination.
3717     for (auto &S : SU->Succs)
3718       if (S.getKind() == SDep::Order) {
3719         if (S.getSUnit() == *I && stageScheduled(*I) == StageInst1) {
3720           OrderBeforeUse = true;
3721           MoveUse = Pos;
3722         }
3723       } else if (TargetRegisterInfo::isPhysicalRegister(S.getReg())) {
3724         if (cycleScheduled(SU) != cycleScheduled(S.getSUnit())) {
3725           if (S.isAssignedRegDep()) {
3726             OrderAfterDef = true;
3727             MoveDef = Pos;
3728           }
3729         } else {
3730           OrderBeforeUse = true;
3731           MoveUse = Pos;
3732         }
3733       }
3734     for (auto &P : SU->Preds)
3735       if (P.getKind() == SDep::Order) {
3736         if (P.getSUnit() == *I && stageScheduled(*I) == StageInst1) {
3737           OrderAfterDef = true;
3738           MoveDef = Pos;
3739         }
3740       } else if (TargetRegisterInfo::isPhysicalRegister(P.getReg())) {
3741         if (cycleScheduled(SU) != cycleScheduled(P.getSUnit())) {
3742           if (P.isAssignedRegDep()) {
3743             OrderBeforeUse = true;
3744             MoveUse = Pos;
3745           }
3746         } else {
3747           OrderAfterDef = true;
3748           MoveDef = Pos;
3749         }
3750       }
3751   }
3752 
3753   // A circular dependence.
3754   if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3755     OrderBeforeUse = false;
3756 
3757   // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3758   // to a loop-carried dependence.
3759   if (OrderBeforeDef)
3760     OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3761 
3762   // The uncommon case when the instruction order needs to be updated because
3763   // there is both a use and def.
3764   if (OrderBeforeUse && OrderAfterDef) {
3765     SUnit *UseSU = Insts.at(MoveUse);
3766     SUnit *DefSU = Insts.at(MoveDef);
3767     if (MoveUse > MoveDef) {
3768       Insts.erase(Insts.begin() + MoveUse);
3769       Insts.erase(Insts.begin() + MoveDef);
3770     } else {
3771       Insts.erase(Insts.begin() + MoveDef);
3772       Insts.erase(Insts.begin() + MoveUse);
3773     }
3774     if (orderDependence(SSD, UseSU, Insts)) {
3775       Insts.push_front(SU);
3776       orderDependence(SSD, DefSU, Insts);
3777       return true;
3778     }
3779     Insts.pop_back();
3780     Insts.push_back(SU);
3781     Insts.push_back(UseSU);
3782     orderDependence(SSD, DefSU, Insts);
3783     return false;
3784   }
3785   // Put the new instruction first if there is a use in the list. Otherwise,
3786   // put it at the end of the list.
3787   if (OrderBeforeUse)
3788     Insts.push_front(SU);
3789   else
3790     Insts.push_back(SU);
3791   return OrderBeforeUse;
3792 }
3793 
3794 /// Return true if the scheduled Phi has a loop carried operand.
3795 bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
3796   if (!Phi.isPHI())
3797     return false;
3798   assert(Phi.isPHI() && "Expecing a Phi.");
3799   SUnit *DefSU = SSD->getSUnit(&Phi);
3800   unsigned DefCycle = cycleScheduled(DefSU);
3801   int DefStage = stageScheduled(DefSU);
3802 
3803   unsigned InitVal = 0;
3804   unsigned LoopVal = 0;
3805   getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3806   SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3807   if (!UseSU)
3808     return true;
3809   if (UseSU->getInstr()->isPHI())
3810     return true;
3811   unsigned LoopCycle = cycleScheduled(UseSU);
3812   int LoopStage = stageScheduled(UseSU);
3813   return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3814 }
3815 
3816 /// Return true if the instruction is a definition that is loop carried
3817 /// and defines the use on the next iteration.
3818 ///        v1 = phi(v2, v3)
3819 ///  (Def) v3 = op v1
3820 ///  (MO)   = v1
3821 /// If MO appears before Def, then then v1 and v3 may get assigned to the same
3822 /// register.
3823 bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
3824                                        MachineInstr *Def, MachineOperand &MO) {
3825   if (!MO.isReg())
3826     return false;
3827   if (Def->isPHI())
3828     return false;
3829   MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3830   if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3831     return false;
3832   if (!isLoopCarried(SSD, *Phi))
3833     return false;
3834   unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3835   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
3836     MachineOperand &DMO = Def->getOperand(i);
3837     if (!DMO.isReg() || !DMO.isDef())
3838       continue;
3839     if (DMO.getReg() == LoopReg)
3840       return true;
3841   }
3842   return false;
3843 }
3844 
3845 // Check if the generated schedule is valid. This function checks if
3846 // an instruction that uses a physical register is scheduled in a
3847 // different stage than the definition. The pipeliner does not handle
3848 // physical register values that may cross a basic block boundary.
3849 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3850   for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
3851     SUnit &SU = SSD->SUnits[i];
3852     if (!SU.hasPhysRegDefs)
3853       continue;
3854     int StageDef = stageScheduled(&SU);
3855     assert(StageDef != -1 && "Instruction should have been scheduled.");
3856     for (auto &SI : SU.Succs)
3857       if (SI.isAssignedRegDep())
3858         if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
3859           if (stageScheduled(SI.getSUnit()) != StageDef)
3860             return false;
3861   }
3862   return true;
3863 }
3864 
3865 /// Attempt to fix the degenerate cases when the instruction serialization
3866 /// causes the register lifetimes to overlap. For example,
3867 ///   p' = store_pi(p, b)
3868 ///      = load p, offset
3869 /// In this case p and p' overlap, which means that two registers are needed.
3870 /// Instead, this function changes the load to use p' and updates the offset.
3871 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3872   unsigned OverlapReg = 0;
3873   unsigned NewBaseReg = 0;
3874   for (SUnit *SU : Instrs) {
3875     MachineInstr *MI = SU->getInstr();
3876     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3877       const MachineOperand &MO = MI->getOperand(i);
3878       // Look for an instruction that uses p. The instruction occurs in the
3879       // same cycle but occurs later in the serialized order.
3880       if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3881         // Check that the instruction appears in the InstrChanges structure,
3882         // which contains instructions that can have the offset updated.
3883         DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3884           InstrChanges.find(SU);
3885         if (It != InstrChanges.end()) {
3886           unsigned BasePos, OffsetPos;
3887           // Update the base register and adjust the offset.
3888           if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3889             MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3890             NewMI->getOperand(BasePos).setReg(NewBaseReg);
3891             int64_t NewOffset =
3892                 MI->getOperand(OffsetPos).getImm() - It->second.second;
3893             NewMI->getOperand(OffsetPos).setImm(NewOffset);
3894             SU->setInstr(NewMI);
3895             MISUnitMap[NewMI] = SU;
3896             NewMIs.insert(NewMI);
3897           }
3898         }
3899         OverlapReg = 0;
3900         NewBaseReg = 0;
3901         break;
3902       }
3903       // Look for an instruction of the form p' = op(p), which uses and defines
3904       // two virtual registers that get allocated to the same physical register.
3905       unsigned TiedUseIdx = 0;
3906       if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3907         // OverlapReg is p in the example above.
3908         OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3909         // NewBaseReg is p' in the example above.
3910         NewBaseReg = MI->getOperand(i).getReg();
3911         break;
3912       }
3913     }
3914   }
3915 }
3916 
3917 /// After the schedule has been formed, call this function to combine
3918 /// the instructions from the different stages/cycles.  That is, this
3919 /// function creates a schedule that represents a single iteration.
3920 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3921   // Move all instructions to the first stage from later stages.
3922   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3923     for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3924          ++stage) {
3925       std::deque<SUnit *> &cycleInstrs =
3926           ScheduledInstrs[cycle + (stage * InitiationInterval)];
3927       for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
3928                                                  E = cycleInstrs.rend();
3929            I != E; ++I)
3930         ScheduledInstrs[cycle].push_front(*I);
3931     }
3932   }
3933   // Iterate over the definitions in each instruction, and compute the
3934   // stage difference for each use.  Keep the maximum value.
3935   for (auto &I : InstrToCycle) {
3936     int DefStage = stageScheduled(I.first);
3937     MachineInstr *MI = I.first->getInstr();
3938     for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3939       MachineOperand &Op = MI->getOperand(i);
3940       if (!Op.isReg() || !Op.isDef())
3941         continue;
3942 
3943       unsigned Reg = Op.getReg();
3944       unsigned MaxDiff = 0;
3945       bool PhiIsSwapped = false;
3946       for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
3947                                              EI = MRI.use_end();
3948            UI != EI; ++UI) {
3949         MachineOperand &UseOp = *UI;
3950         MachineInstr *UseMI = UseOp.getParent();
3951         SUnit *SUnitUse = SSD->getSUnit(UseMI);
3952         int UseStage = stageScheduled(SUnitUse);
3953         unsigned Diff = 0;
3954         if (UseStage != -1 && UseStage >= DefStage)
3955           Diff = UseStage - DefStage;
3956         if (MI->isPHI()) {
3957           if (isLoopCarried(SSD, *MI))
3958             ++Diff;
3959           else
3960             PhiIsSwapped = true;
3961         }
3962         MaxDiff = std::max(Diff, MaxDiff);
3963       }
3964       RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
3965     }
3966   }
3967 
3968   // Erase all the elements in the later stages. Only one iteration should
3969   // remain in the scheduled list, and it contains all the instructions.
3970   for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3971     ScheduledInstrs.erase(cycle);
3972 
3973   // Change the registers in instruction as specified in the InstrChanges
3974   // map. We need to use the new registers to create the correct order.
3975   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
3976     SUnit *SU = &SSD->SUnits[i];
3977     SSD->applyInstrChange(SU->getInstr(), *this);
3978   }
3979 
3980   // Reorder the instructions in each cycle to fix and improve the
3981   // generated code.
3982   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3983     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3984     std::deque<SUnit *> newOrderZC;
3985     // Put the zero-cost, pseudo instructions at the start of the cycle.
3986     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3987       SUnit *SU = cycleInstrs[i];
3988       if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3989         orderDependence(SSD, SU, newOrderZC);
3990     }
3991     std::deque<SUnit *> newOrderI;
3992     // Then, add the regular instructions back.
3993     for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
3994       SUnit *SU = cycleInstrs[i];
3995       if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3996         orderDependence(SSD, SU, newOrderI);
3997     }
3998     // Replace the old order with the new order.
3999     cycleInstrs.swap(newOrderZC);
4000     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
4001     SSD->fixupRegisterOverlaps(cycleInstrs);
4002   }
4003 
4004   DEBUG(dump(););
4005 }
4006 
4007 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4008 /// Print the schedule information to the given output.
4009 void SMSchedule::print(raw_ostream &os) const {
4010   // Iterate over each cycle.
4011   for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
4012     // Iterate over each instruction in the cycle.
4013     const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
4014     for (SUnit *CI : cycleInstrs->second) {
4015       os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
4016       os << "(" << CI->NodeNum << ") ";
4017       CI->getInstr()->print(os);
4018       os << "\n";
4019     }
4020   }
4021 }
4022 
4023 /// Utility function used for debugging to print the schedule.
4024 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
4025 #endif
4026