1 //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Copies from VGPR to SGPR registers are illegal and the register coalescer
11 /// will sometimes generate these illegal copies in situations like this:
12 ///
13 ///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
14 ///
15 /// BB0:
16 ///   %0 <sgpr> = SCALAR_INST
17 ///   %1 <vsrc> = COPY %0 <sgpr>
18 ///    ...
19 ///    BRANCH %cond BB1, BB2
20 ///  BB1:
21 ///    %2 <vgpr> = VECTOR_INST
22 ///    %3 <vsrc> = COPY %2 <vgpr>
23 ///  BB2:
24 ///    %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
25 ///    %5 <vgpr> = VECTOR_INST %4 <vsrc>
26 ///
27 ///
28 /// The coalescer will begin at BB0 and eliminate its copy, then the resulting
29 /// code will look like this:
30 ///
31 /// BB0:
32 ///   %0 <sgpr> = SCALAR_INST
33 ///    ...
34 ///    BRANCH %cond BB1, BB2
35 /// BB1:
36 ///   %2 <vgpr> = VECTOR_INST
37 ///   %3 <vsrc> = COPY %2 <vgpr>
38 /// BB2:
39 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
40 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
41 ///
42 /// Now that the result of the PHI instruction is an SGPR, the register
43 /// allocator is now forced to constrain the register class of %3 to
44 /// <sgpr> so we end up with final code like this:
45 ///
46 /// BB0:
47 ///   %0 <sgpr> = SCALAR_INST
48 ///    ...
49 ///    BRANCH %cond BB1, BB2
50 /// BB1:
51 ///   %2 <vgpr> = VECTOR_INST
52 ///   %3 <sgpr> = COPY %2 <vgpr>
53 /// BB2:
54 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
55 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
56 ///
57 /// Now this code contains an illegal copy from a VGPR to an SGPR.
58 ///
59 /// In order to avoid this problem, this pass searches for PHI instructions
60 /// which define a <vsrc> register and constrains its definition class to
61 /// <vgpr> if the user of the PHI's definition register is a vector instruction.
62 /// If the PHI's definition class is constrained to <vgpr> then the coalescer
63 /// will be unable to perform the COPY removal from the above example which
64 /// ultimately led to the creation of an illegal COPY.
65 //===----------------------------------------------------------------------===//
66 
67 #include "AMDGPU.h"
68 #include "AMDGPUSubtarget.h"
69 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
70 #include "SIInstrInfo.h"
71 #include "SIRegisterInfo.h"
72 #include "llvm/ADT/DenseSet.h"
73 #include "llvm/ADT/STLExtras.h"
74 #include "llvm/ADT/SmallSet.h"
75 #include "llvm/ADT/SmallVector.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunction.h"
79 #include "llvm/CodeGen/MachineFunctionPass.h"
80 #include "llvm/CodeGen/MachineInstr.h"
81 #include "llvm/CodeGen/MachineInstrBuilder.h"
82 #include "llvm/CodeGen/MachineOperand.h"
83 #include "llvm/CodeGen/MachineRegisterInfo.h"
84 #include "llvm/CodeGen/TargetRegisterInfo.h"
85 #include "llvm/InitializePasses.h"
86 #include "llvm/Pass.h"
87 #include "llvm/Support/CodeGen.h"
88 #include "llvm/Support/CommandLine.h"
89 #include "llvm/Support/Debug.h"
90 #include "llvm/Support/raw_ostream.h"
91 #include "llvm/Target/TargetMachine.h"
92 #include <cassert>
93 #include <cstdint>
94 #include <iterator>
95 #include <list>
96 #include <map>
97 #include <tuple>
98 #include <utility>
99 
100 using namespace llvm;
101 
102 #define DEBUG_TYPE "si-fix-sgpr-copies"
103 
104 static cl::opt<bool> EnableM0Merge(
105   "amdgpu-enable-merge-m0",
106   cl::desc("Merge and hoist M0 initializations"),
107   cl::init(true));
108 
109 namespace {
110 
111 class SIFixSGPRCopies : public MachineFunctionPass {
112   MachineDominatorTree *MDT;
113 
114 public:
115   static char ID;
116 
117   MachineRegisterInfo *MRI;
118   const SIRegisterInfo *TRI;
119   const SIInstrInfo *TII;
120 
121   SIFixSGPRCopies() : MachineFunctionPass(ID) {}
122 
123   bool runOnMachineFunction(MachineFunction &MF) override;
124 
125   void processPHINode(MachineInstr &MI);
126 
127   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
128 
129   void getAnalysisUsage(AnalysisUsage &AU) const override {
130     AU.addRequired<MachineDominatorTree>();
131     AU.addPreserved<MachineDominatorTree>();
132     AU.setPreservesCFG();
133     MachineFunctionPass::getAnalysisUsage(AU);
134   }
135 };
136 
137 } // end anonymous namespace
138 
139 INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
140                      "SI Fix SGPR copies", false, false)
141 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
142 INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
143                      "SI Fix SGPR copies", false, false)
144 
145 char SIFixSGPRCopies::ID = 0;
146 
147 char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
148 
149 FunctionPass *llvm::createSIFixSGPRCopiesPass() {
150   return new SIFixSGPRCopies();
151 }
152 
153 static bool hasVectorOperands(const MachineInstr &MI,
154                               const SIRegisterInfo *TRI) {
155   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
156   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
157     if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
158       continue;
159 
160     if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
161       return true;
162   }
163   return false;
164 }
165 
166 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
167 getCopyRegClasses(const MachineInstr &Copy,
168                   const SIRegisterInfo &TRI,
169                   const MachineRegisterInfo &MRI) {
170   Register DstReg = Copy.getOperand(0).getReg();
171   Register SrcReg = Copy.getOperand(1).getReg();
172 
173   const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
174                                          ? MRI.getRegClass(SrcReg)
175                                          : TRI.getPhysRegClass(SrcReg);
176 
177   // We don't really care about the subregister here.
178   // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
179 
180   const TargetRegisterClass *DstRC = DstReg.isVirtual()
181                                          ? MRI.getRegClass(DstReg)
182                                          : TRI.getPhysRegClass(DstReg);
183 
184   return std::make_pair(SrcRC, DstRC);
185 }
186 
187 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
188                              const TargetRegisterClass *DstRC,
189                              const SIRegisterInfo &TRI) {
190   return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
191          TRI.hasVectorRegisters(SrcRC);
192 }
193 
194 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
195                              const TargetRegisterClass *DstRC,
196                              const SIRegisterInfo &TRI) {
197   return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
198          TRI.hasVectorRegisters(DstRC);
199 }
200 
201 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
202                                       const SIRegisterInfo *TRI,
203                                       const SIInstrInfo *TII) {
204   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
205   auto &Src = MI.getOperand(1);
206   Register DstReg = MI.getOperand(0).getReg();
207   Register SrcReg = Src.getReg();
208   if (!SrcReg.isVirtual() || !DstReg.isVirtual())
209     return false;
210 
211   for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
212     const auto *UseMI = MO.getParent();
213     if (UseMI == &MI)
214       continue;
215     if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
216         UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
217         !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
218       return false;
219   }
220   // Change VGPR to SGPR destination.
221   MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
222   return true;
223 }
224 
225 // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
226 //
227 // SGPRx = ...
228 // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
229 // VGPRz = COPY SGPRy
230 //
231 // ==>
232 //
233 // VGPRx = COPY SGPRx
234 // VGPRz = REG_SEQUENCE VGPRx, sub0
235 //
236 // This exposes immediate folding opportunities when materializing 64-bit
237 // immediates.
238 static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
239                                         const SIRegisterInfo *TRI,
240                                         const SIInstrInfo *TII,
241                                         MachineRegisterInfo &MRI) {
242   assert(MI.isRegSequence());
243 
244   Register DstReg = MI.getOperand(0).getReg();
245   if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
246     return false;
247 
248   if (!MRI.hasOneUse(DstReg))
249     return false;
250 
251   MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
252   if (!CopyUse.isCopy())
253     return false;
254 
255   // It is illegal to have vreg inputs to a physreg defining reg_sequence.
256   if (CopyUse.getOperand(0).getReg().isPhysical())
257     return false;
258 
259   const TargetRegisterClass *SrcRC, *DstRC;
260   std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
261 
262   if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
263     return false;
264 
265   if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
266     return true;
267 
268   // TODO: Could have multiple extracts?
269   unsigned SubReg = CopyUse.getOperand(1).getSubReg();
270   if (SubReg != AMDGPU::NoSubRegister)
271     return false;
272 
273   MRI.setRegClass(DstReg, DstRC);
274 
275   // SGPRx = ...
276   // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
277   // VGPRz = COPY SGPRy
278 
279   // =>
280   // VGPRx = COPY SGPRx
281   // VGPRz = REG_SEQUENCE VGPRx, sub0
282 
283   MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
284   bool IsAGPR = TRI->hasAGPRs(DstRC);
285 
286   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
287     Register SrcReg = MI.getOperand(I).getReg();
288     unsigned SrcSubReg = MI.getOperand(I).getSubReg();
289 
290     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
291     assert(TRI->isSGPRClass(SrcRC) &&
292            "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
293 
294     SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
295     const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
296 
297     Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
298 
299     BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
300             TmpReg)
301         .add(MI.getOperand(I));
302 
303     if (IsAGPR) {
304       const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
305       Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
306       unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
307         AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
308       BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
309             TmpAReg)
310         .addReg(TmpReg, RegState::Kill);
311       TmpReg = TmpAReg;
312     }
313 
314     MI.getOperand(I).setReg(TmpReg);
315   }
316 
317   CopyUse.eraseFromParent();
318   return true;
319 }
320 
321 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
322                                     const MachineInstr *MoveImm,
323                                     const SIInstrInfo *TII,
324                                     unsigned &SMovOp,
325                                     int64_t &Imm) {
326   if (Copy->getOpcode() != AMDGPU::COPY)
327     return false;
328 
329   if (!MoveImm->isMoveImmediate())
330     return false;
331 
332   const MachineOperand *ImmOp =
333       TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
334   if (!ImmOp->isImm())
335     return false;
336 
337   // FIXME: Handle copies with sub-regs.
338   if (Copy->getOperand(0).getSubReg())
339     return false;
340 
341   switch (MoveImm->getOpcode()) {
342   default:
343     return false;
344   case AMDGPU::V_MOV_B32_e32:
345     SMovOp = AMDGPU::S_MOV_B32;
346     break;
347   case AMDGPU::V_MOV_B64_PSEUDO:
348     SMovOp = AMDGPU::S_MOV_B64;
349     break;
350   }
351   Imm = ImmOp->getImm();
352   return true;
353 }
354 
355 template <class UnaryPredicate>
356 bool searchPredecessors(const MachineBasicBlock *MBB,
357                         const MachineBasicBlock *CutOff,
358                         UnaryPredicate Predicate) {
359   if (MBB == CutOff)
360     return false;
361 
362   DenseSet<const MachineBasicBlock *> Visited;
363   SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
364                                                MBB->pred_end());
365 
366   while (!Worklist.empty()) {
367     MachineBasicBlock *MBB = Worklist.pop_back_val();
368 
369     if (!Visited.insert(MBB).second)
370       continue;
371     if (MBB == CutOff)
372       continue;
373     if (Predicate(MBB))
374       return true;
375 
376     Worklist.append(MBB->pred_begin(), MBB->pred_end());
377   }
378 
379   return false;
380 }
381 
382 // Checks if there is potential path From instruction To instruction.
383 // If CutOff is specified and it sits in between of that path we ignore
384 // a higher portion of the path and report it is not reachable.
385 static bool isReachable(const MachineInstr *From,
386                         const MachineInstr *To,
387                         const MachineBasicBlock *CutOff,
388                         MachineDominatorTree &MDT) {
389   // If either From block dominates To block or instructions are in the same
390   // block and From is higher.
391   if (MDT.dominates(From, To))
392     return true;
393 
394   const MachineBasicBlock *MBBFrom = From->getParent();
395   const MachineBasicBlock *MBBTo = To->getParent();
396   if (MBBFrom == MBBTo)
397     return false;
398 
399   // Instructions are in different blocks, do predecessor search.
400   // We should almost never get here since we do not usually produce M0 stores
401   // other than -1.
402   return searchPredecessors(MBBTo, CutOff, [MBBFrom]
403            (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
404 }
405 
406 // Return the first non-prologue instruction in the block.
407 static MachineBasicBlock::iterator
408 getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
409   MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
410   while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
411     ++I;
412 
413   return I;
414 }
415 
416 // Hoist and merge identical SGPR initializations into a common predecessor.
417 // This is intended to combine M0 initializations, but can work with any
418 // SGPR. A VGPR cannot be processed since we cannot guarantee vector
419 // executioon.
420 static bool hoistAndMergeSGPRInits(unsigned Reg,
421                                    const MachineRegisterInfo &MRI,
422                                    const TargetRegisterInfo *TRI,
423                                    MachineDominatorTree &MDT,
424                                    const TargetInstrInfo *TII) {
425   // List of inits by immediate value.
426   using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
427   InitListMap Inits;
428   // List of clobbering instructions.
429   SmallVector<MachineInstr*, 8> Clobbers;
430   // List of instructions marked for deletion.
431   SmallSet<MachineInstr*, 8> MergedInstrs;
432 
433   bool Changed = false;
434 
435   for (auto &MI : MRI.def_instructions(Reg)) {
436     MachineOperand *Imm = nullptr;
437     for (auto &MO : MI.operands()) {
438       if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
439           (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
440         Imm = nullptr;
441         break;
442       } else if (MO.isImm())
443         Imm = &MO;
444     }
445     if (Imm)
446       Inits[Imm->getImm()].push_front(&MI);
447     else
448       Clobbers.push_back(&MI);
449   }
450 
451   for (auto &Init : Inits) {
452     auto &Defs = Init.second;
453 
454     for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
455       MachineInstr *MI1 = *I1;
456 
457       for (auto I2 = std::next(I1); I2 != E; ) {
458         MachineInstr *MI2 = *I2;
459 
460         // Check any possible interference
461         auto interferes = [&](MachineBasicBlock::iterator From,
462                               MachineBasicBlock::iterator To) -> bool {
463 
464           assert(MDT.dominates(&*To, &*From));
465 
466           auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
467             const MachineBasicBlock *MBBFrom = From->getParent();
468             const MachineBasicBlock *MBBTo = To->getParent();
469             bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
470             bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
471             if (!MayClobberFrom && !MayClobberTo)
472               return false;
473             if ((MayClobberFrom && !MayClobberTo) ||
474                 (!MayClobberFrom && MayClobberTo))
475               return true;
476             // Both can clobber, this is not an interference only if both are
477             // dominated by Clobber and belong to the same block or if Clobber
478             // properly dominates To, given that To >> From, so it dominates
479             // both and located in a common dominator.
480             return !((MBBFrom == MBBTo &&
481                       MDT.dominates(Clobber, &*From) &&
482                       MDT.dominates(Clobber, &*To)) ||
483                      MDT.properlyDominates(Clobber->getParent(), MBBTo));
484           };
485 
486           return (llvm::any_of(Clobbers, interferes)) ||
487                  (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
488                     return C.first != Init.first &&
489                            llvm::any_of(C.second, interferes);
490                   }));
491         };
492 
493         if (MDT.dominates(MI1, MI2)) {
494           if (!interferes(MI2, MI1)) {
495             LLVM_DEBUG(dbgs()
496                        << "Erasing from "
497                        << printMBBReference(*MI2->getParent()) << " " << *MI2);
498             MergedInstrs.insert(MI2);
499             Changed = true;
500             ++I2;
501             continue;
502           }
503         } else if (MDT.dominates(MI2, MI1)) {
504           if (!interferes(MI1, MI2)) {
505             LLVM_DEBUG(dbgs()
506                        << "Erasing from "
507                        << printMBBReference(*MI1->getParent()) << " " << *MI1);
508             MergedInstrs.insert(MI1);
509             Changed = true;
510             ++I1;
511             break;
512           }
513         } else {
514           auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
515                                                      MI2->getParent());
516           if (!MBB) {
517             ++I2;
518             continue;
519           }
520 
521           MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
522           if (!interferes(MI1, I) && !interferes(MI2, I)) {
523             LLVM_DEBUG(dbgs()
524                        << "Erasing from "
525                        << printMBBReference(*MI1->getParent()) << " " << *MI1
526                        << "and moving from "
527                        << printMBBReference(*MI2->getParent()) << " to "
528                        << printMBBReference(*I->getParent()) << " " << *MI2);
529             I->getParent()->splice(I, MI2->getParent(), MI2);
530             MergedInstrs.insert(MI1);
531             Changed = true;
532             ++I1;
533             break;
534           }
535         }
536         ++I2;
537       }
538       ++I1;
539     }
540   }
541 
542   // Remove initializations that were merged into another.
543   for (auto &Init : Inits) {
544     auto &Defs = Init.second;
545     auto I = Defs.begin();
546     while (I != Defs.end()) {
547       if (MergedInstrs.count(*I)) {
548         (*I)->eraseFromParent();
549         I = Defs.erase(I);
550       } else
551         ++I;
552     }
553   }
554 
555   // Try to schedule SGPR initializations as early as possible in the MBB.
556   for (auto &Init : Inits) {
557     auto &Defs = Init.second;
558     for (auto MI : Defs) {
559       auto MBB = MI->getParent();
560       MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
561       MachineBasicBlock::reverse_iterator B(BoundaryMI);
562       // Check if B should actually be a boundary. If not set the previous
563       // instruction as the boundary instead.
564       if (!TII->isBasicBlockPrologue(*B))
565         B++;
566 
567       auto R = std::next(MI->getReverseIterator());
568       const unsigned Threshold = 50;
569       // Search until B or Threshold for a place to insert the initialization.
570       for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
571         if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
572             TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
573           break;
574 
575       // Move to directly after R.
576       if (&*--R != MI)
577         MBB->splice(*R, MBB, MI);
578     }
579   }
580 
581   if (Changed)
582     MRI.clearKillFlags(Reg);
583 
584   return Changed;
585 }
586 
587 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
588   // Only need to run this in SelectionDAG path.
589   if (MF.getProperties().hasProperty(
590         MachineFunctionProperties::Property::Selected))
591     return false;
592 
593   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
594   MRI = &MF.getRegInfo();
595   TRI = ST.getRegisterInfo();
596   TII = ST.getInstrInfo();
597   MDT = &getAnalysis<MachineDominatorTree>();
598 
599   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
600                                                   BI != BE; ++BI) {
601     MachineBasicBlock &MBB = *BI;
602     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
603          I != E; ++I) {
604       MachineInstr &MI = *I;
605 
606       switch (MI.getOpcode()) {
607       default:
608         continue;
609       case AMDGPU::COPY:
610       case AMDGPU::WQM:
611       case AMDGPU::SOFT_WQM:
612       case AMDGPU::WWM: {
613         Register DstReg = MI.getOperand(0).getReg();
614 
615         const TargetRegisterClass *SrcRC, *DstRC;
616         std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
617 
618         if (!DstReg.isVirtual()) {
619           // If the destination register is a physical register there isn't
620           // really much we can do to fix this.
621           // Some special instructions use M0 as an input. Some even only use
622           // the first lane. Insert a readfirstlane and hope for the best.
623           if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
624             Register TmpReg
625               = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
626 
627             BuildMI(MBB, MI, MI.getDebugLoc(),
628                     TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
629               .add(MI.getOperand(1));
630             MI.getOperand(1).setReg(TmpReg);
631           }
632 
633           continue;
634         }
635 
636         if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
637           Register SrcReg = MI.getOperand(1).getReg();
638           if (!SrcReg.isVirtual()) {
639             TII->moveToVALU(MI, MDT);
640             break;
641           }
642 
643           MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
644           unsigned SMovOp;
645           int64_t Imm;
646           // If we are just copying an immediate, we can replace the copy with
647           // s_mov_b32.
648           if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
649             MI.getOperand(1).ChangeToImmediate(Imm);
650             MI.addImplicitDefUseOperands(MF);
651             MI.setDesc(TII->get(SMovOp));
652             break;
653           }
654           TII->moveToVALU(MI, MDT);
655         } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
656           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
657         }
658 
659         break;
660       }
661       case AMDGPU::PHI: {
662         processPHINode(MI);
663         break;
664       }
665       case AMDGPU::REG_SEQUENCE:
666         if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
667             !hasVectorOperands(MI, TRI)) {
668           foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
669           continue;
670         }
671 
672         LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
673 
674         TII->moveToVALU(MI, MDT);
675         break;
676       case AMDGPU::INSERT_SUBREG: {
677         const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
678         DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
679         Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
680         Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
681         if (TRI->isSGPRClass(DstRC) &&
682             (TRI->hasVectorRegisters(Src0RC) ||
683              TRI->hasVectorRegisters(Src1RC))) {
684           LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
685           TII->moveToVALU(MI, MDT);
686         }
687         break;
688       }
689       case AMDGPU::V_WRITELANE_B32: {
690         // Some architectures allow more than one constant bus access without
691         // SGPR restriction
692         if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
693           break;
694 
695         // Writelane is special in that it can use SGPR and M0 (which would
696         // normally count as using the constant bus twice - but in this case it
697         // is allowed since the lane selector doesn't count as a use of the
698         // constant bus). However, it is still required to abide by the 1 SGPR
699         // rule. Apply a fix here as we might have multiple SGPRs after
700         // legalizing VGPRs to SGPRs
701         int Src0Idx =
702             AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
703         int Src1Idx =
704             AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
705         MachineOperand &Src0 = MI.getOperand(Src0Idx);
706         MachineOperand &Src1 = MI.getOperand(Src1Idx);
707 
708         // Check to see if the instruction violates the 1 SGPR rule
709         if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
710              Src0.getReg() != AMDGPU::M0) &&
711             (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
712              Src1.getReg() != AMDGPU::M0)) {
713 
714           // Check for trivially easy constant prop into one of the operands
715           // If this is the case then perform the operation now to resolve SGPR
716           // issue. If we don't do that here we will always insert a mov to m0
717           // that can't be resolved in later operand folding pass
718           bool Resolved = false;
719           for (MachineOperand *MO : {&Src0, &Src1}) {
720             if (MO->getReg().isVirtual()) {
721               MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
722               if (DefMI && TII->isFoldableCopy(*DefMI)) {
723                 const MachineOperand &Def = DefMI->getOperand(0);
724                 if (Def.isReg() &&
725                     MO->getReg() == Def.getReg() &&
726                     MO->getSubReg() == Def.getSubReg()) {
727                   const MachineOperand &Copied = DefMI->getOperand(1);
728                   if (Copied.isImm() &&
729                       TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
730                     MO->ChangeToImmediate(Copied.getImm());
731                     Resolved = true;
732                     break;
733                   }
734                 }
735               }
736             }
737           }
738 
739           if (!Resolved) {
740             // Haven't managed to resolve by replacing an SGPR with an immediate
741             // Move src1 to be in M0
742             BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
743                     TII->get(AMDGPU::COPY), AMDGPU::M0)
744                 .add(Src1);
745             Src1.ChangeToRegister(AMDGPU::M0, false);
746           }
747         }
748         break;
749       }
750       }
751     }
752   }
753 
754   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
755     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
756 
757   return true;
758 }
759 
760 void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
761   unsigned numVGPRUses = 0;
762   bool AllAGPRUses = true;
763   SetVector<const MachineInstr *> worklist;
764   SmallSet<const MachineInstr *, 4> Visited;
765   SetVector<MachineInstr *> PHIOperands;
766   worklist.insert(&MI);
767   Visited.insert(&MI);
768   while (!worklist.empty()) {
769     const MachineInstr *Instr = worklist.pop_back_val();
770     Register Reg = Instr->getOperand(0).getReg();
771     for (const auto &Use : MRI->use_operands(Reg)) {
772       const MachineInstr *UseMI = Use.getParent();
773       AllAGPRUses &= (UseMI->isCopy() &&
774                       TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
775                      TRI->isAGPR(*MRI, Use.getReg());
776       if (UseMI->isCopy() || UseMI->isRegSequence()) {
777         if (UseMI->isCopy() &&
778           UseMI->getOperand(0).getReg().isPhysical() &&
779           !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
780           numVGPRUses++;
781         }
782         if (Visited.insert(UseMI).second)
783           worklist.insert(UseMI);
784 
785         continue;
786       }
787 
788       if (UseMI->isPHI()) {
789         const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
790         if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
791           UseRC != &AMDGPU::VReg_1RegClass)
792           numVGPRUses++;
793         continue;
794       }
795 
796       const TargetRegisterClass *OpRC =
797         TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
798       if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
799         OpRC != &AMDGPU::VS_64RegClass) {
800         numVGPRUses++;
801       }
802     }
803   }
804 
805   Register PHIRes = MI.getOperand(0).getReg();
806   const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
807   if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
808     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
809     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
810     for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
811       MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
812       if (DefMI && DefMI->isPHI())
813         PHIOperands.insert(DefMI);
814     }
815   }
816 
817   bool hasVGPRInput = false;
818   for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
819     Register InputReg = MI.getOperand(i).getReg();
820     MachineInstr *Def = MRI->getVRegDef(InputReg);
821     if (TRI->isVectorRegister(*MRI, InputReg)) {
822       if (Def->isCopy()) {
823         Register SrcReg = Def->getOperand(1).getReg();
824         const TargetRegisterClass *RC =
825           TRI->getRegClassForReg(*MRI, SrcReg);
826         if (TRI->isSGPRClass(RC))
827           continue;
828       }
829       hasVGPRInput = true;
830       break;
831     }
832     else if (Def->isCopy() &&
833       TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
834       Register SrcReg = Def->getOperand(1).getReg();
835       MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
836       unsigned SMovOp;
837       int64_t Imm;
838       if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
839         hasVGPRInput = true;
840         break;
841       } else {
842         // Formally, if we did not do this right away
843         // it would be done on the next iteration of the
844         // runOnMachineFunction main loop. But why not if we can?
845         MachineFunction *MF = MI.getParent()->getParent();
846         Def->getOperand(1).ChangeToImmediate(Imm);
847         Def->addImplicitDefUseOperands(*MF);
848         Def->setDesc(TII->get(SMovOp));
849       }
850     }
851   }
852 
853   if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
854        RC0 != &AMDGPU::VReg_1RegClass) &&
855     (hasVGPRInput || numVGPRUses > 1)) {
856     LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
857     TII->moveToVALU(MI);
858   }
859   else {
860     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
861     TII->legalizeOperands(MI, MDT);
862   }
863 
864   // Propagate register class back to PHI operands which are PHI themselves.
865   while (!PHIOperands.empty()) {
866     processPHINode(*PHIOperands.pop_back_val());
867   }
868 }
869