1 //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Copies from VGPR to SGPR registers are illegal and the register coalescer
11 /// will sometimes generate these illegal copies in situations like this:
12 ///
13 ///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
14 ///
15 /// BB0:
16 ///   %0 <sgpr> = SCALAR_INST
17 ///   %1 <vsrc> = COPY %0 <sgpr>
18 ///    ...
19 ///    BRANCH %cond BB1, BB2
20 ///  BB1:
21 ///    %2 <vgpr> = VECTOR_INST
22 ///    %3 <vsrc> = COPY %2 <vgpr>
23 ///  BB2:
///    %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
25 ///    %5 <vgpr> = VECTOR_INST %4 <vsrc>
26 ///
27 ///
28 /// The coalescer will begin at BB0 and eliminate its copy, then the resulting
29 /// code will look like this:
30 ///
31 /// BB0:
32 ///   %0 <sgpr> = SCALAR_INST
33 ///    ...
34 ///    BRANCH %cond BB1, BB2
35 /// BB1:
36 ///   %2 <vgpr> = VECTOR_INST
37 ///   %3 <vsrc> = COPY %2 <vgpr>
38 /// BB2:
39 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
40 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
41 ///
42 /// Now that the result of the PHI instruction is an SGPR, the register
43 /// allocator is now forced to constrain the register class of %3 to
44 /// <sgpr> so we end up with final code like this:
45 ///
46 /// BB0:
47 ///   %0 <sgpr> = SCALAR_INST
48 ///    ...
49 ///    BRANCH %cond BB1, BB2
50 /// BB1:
51 ///   %2 <vgpr> = VECTOR_INST
52 ///   %3 <sgpr> = COPY %2 <vgpr>
53 /// BB2:
54 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
55 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
56 ///
57 /// Now this code contains an illegal copy from a VGPR to an SGPR.
58 ///
59 /// In order to avoid this problem, this pass searches for PHI instructions
60 /// which define a <vsrc> register and constrains its definition class to
61 /// <vgpr> if the user of the PHI's definition register is a vector instruction.
62 /// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example, which
/// ultimately led to the creation of an illegal COPY.
65 //===----------------------------------------------------------------------===//
66 
67 #include "AMDGPU.h"
68 #include "AMDGPUSubtarget.h"
69 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
70 #include "SIInstrInfo.h"
71 #include "SIRegisterInfo.h"
72 #include "llvm/ADT/DenseSet.h"
73 #include "llvm/ADT/STLExtras.h"
74 #include "llvm/ADT/SmallSet.h"
75 #include "llvm/ADT/SmallVector.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunction.h"
79 #include "llvm/CodeGen/MachineFunctionPass.h"
80 #include "llvm/CodeGen/MachineInstr.h"
81 #include "llvm/CodeGen/MachineInstrBuilder.h"
82 #include "llvm/CodeGen/MachineOperand.h"
83 #include "llvm/CodeGen/MachineRegisterInfo.h"
84 #include "llvm/CodeGen/TargetRegisterInfo.h"
85 #include "llvm/InitializePasses.h"
86 #include "llvm/Pass.h"
87 #include "llvm/Support/CodeGen.h"
88 #include "llvm/Support/CommandLine.h"
89 #include "llvm/Support/Debug.h"
90 #include "llvm/Support/raw_ostream.h"
91 #include "llvm/Target/TargetMachine.h"
92 #include <cassert>
93 #include <cstdint>
94 #include <iterator>
95 #include <list>
96 #include <map>
97 #include <tuple>
98 #include <utility>
99 
100 using namespace llvm;
101 
102 #define DEBUG_TYPE "si-fix-sgpr-copies"
103 
// Command-line switch "amdgpu-enable-merge-m0" (defaults to true). It is read
// at the end of runOnMachineFunction: the M0 hoist/merge step only runs when
// this flag is set and the optimization level is above CodeGenOpt::None.
static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));
108 
109 namespace {
110 
111 class SIFixSGPRCopies : public MachineFunctionPass {
112   MachineDominatorTree *MDT;
113 
114 public:
115   static char ID;
116 
117   MachineRegisterInfo *MRI;
118   const SIRegisterInfo *TRI;
119   const SIInstrInfo *TII;
120 
121   SIFixSGPRCopies() : MachineFunctionPass(ID) {}
122 
123   bool runOnMachineFunction(MachineFunction &MF) override;
124 
125   void processPHINode(MachineInstr &MI);
126 
127   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
128 
129   void getAnalysisUsage(AnalysisUsage &AU) const override {
130     AU.addRequired<MachineDominatorTree>();
131     AU.addPreserved<MachineDominatorTree>();
132     AU.setPreservesCFG();
133     MachineFunctionPass::getAnalysisUsage(AU);
134   }
135 };
136 
137 } // end anonymous namespace
138 
// Pass registration boilerplate. MachineDominatorTree is declared as a
// dependency of this pass, matching the addRequired<> call in
// getAnalysisUsage.
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                     "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                     "SI Fix SGPR copies", false, false)

// Storage for the pass identifier handed to the MachineFunctionPass base
// class by the constructor.
char SIFixSGPRCopies::ID = 0;

// Reference to the identifier above, exported in namespace llvm so the pass
// can be named without access to the class itself.
char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
148 
/// Factory function: returns a newly heap-allocated SIFixSGPRCopies pass as a
/// raw pointer.
/// NOTE(review): ownership is presumably taken over by whoever adds the pass
/// to a pass manager -- confirm against the caller.
FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}
152 
153 static bool hasVectorOperands(const MachineInstr &MI,
154                               const SIRegisterInfo *TRI) {
155   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
156   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
157     if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
158       continue;
159 
160     if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
161       return true;
162   }
163   return false;
164 }
165 
166 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
167 getCopyRegClasses(const MachineInstr &Copy,
168                   const SIRegisterInfo &TRI,
169                   const MachineRegisterInfo &MRI) {
170   Register DstReg = Copy.getOperand(0).getReg();
171   Register SrcReg = Copy.getOperand(1).getReg();
172 
173   const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
174                                          ? MRI.getRegClass(SrcReg)
175                                          : TRI.getPhysRegClass(SrcReg);
176 
177   // We don't really care about the subregister here.
178   // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
179 
180   const TargetRegisterClass *DstRC = DstReg.isVirtual()
181                                          ? MRI.getRegClass(DstReg)
182                                          : TRI.getPhysRegClass(DstReg);
183 
184   return std::make_pair(SrcRC, DstRC);
185 }
186 
187 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
188                              const TargetRegisterClass *DstRC,
189                              const SIRegisterInfo &TRI) {
190   return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
191          TRI.hasVectorRegisters(SrcRC);
192 }
193 
194 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
195                              const TargetRegisterClass *DstRC,
196                              const SIRegisterInfo &TRI) {
197   return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
198          TRI.hasVectorRegisters(DstRC);
199 }
200 
201 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
202                                       const SIRegisterInfo *TRI,
203                                       const SIInstrInfo *TII) {
204   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
205   auto &Src = MI.getOperand(1);
206   Register DstReg = MI.getOperand(0).getReg();
207   Register SrcReg = Src.getReg();
208   if (!SrcReg.isVirtual() || !DstReg.isVirtual())
209     return false;
210 
211   for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
212     const auto *UseMI = MO.getParent();
213     if (UseMI == &MI)
214       continue;
215     if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
216         UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
217         !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
218       return false;
219   }
220   // Change VGPR to SGPR destination.
221   MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
222   return true;
223 }
224 
225 // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
226 //
227 // SGPRx = ...
228 // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
229 // VGPRz = COPY SGPRy
230 //
231 // ==>
232 //
233 // VGPRx = COPY SGPRx
234 // VGPRz = REG_SEQUENCE VGPRx, sub0
235 //
236 // This exposes immediate folding opportunities when materializing 64-bit
237 // immediates.
238 static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
239                                         const SIRegisterInfo *TRI,
240                                         const SIInstrInfo *TII,
241                                         MachineRegisterInfo &MRI) {
242   assert(MI.isRegSequence());
243 
244   Register DstReg = MI.getOperand(0).getReg();
245   if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
246     return false;
247 
248   if (!MRI.hasOneUse(DstReg))
249     return false;
250 
251   MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
252   if (!CopyUse.isCopy())
253     return false;
254 
255   // It is illegal to have vreg inputs to a physreg defining reg_sequence.
256   if (CopyUse.getOperand(0).getReg().isPhysical())
257     return false;
258 
259   const TargetRegisterClass *SrcRC, *DstRC;
260   std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
261 
262   if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
263     return false;
264 
265   if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
266     return true;
267 
268   // TODO: Could have multiple extracts?
269   unsigned SubReg = CopyUse.getOperand(1).getSubReg();
270   if (SubReg != AMDGPU::NoSubRegister)
271     return false;
272 
273   MRI.setRegClass(DstReg, DstRC);
274 
275   // SGPRx = ...
276   // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
277   // VGPRz = COPY SGPRy
278 
279   // =>
280   // VGPRx = COPY SGPRx
281   // VGPRz = REG_SEQUENCE VGPRx, sub0
282 
283   MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
284   bool IsAGPR = TRI->hasAGPRs(DstRC);
285 
286   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
287     Register SrcReg = MI.getOperand(I).getReg();
288     unsigned SrcSubReg = MI.getOperand(I).getSubReg();
289 
290     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
291     assert(TRI->isSGPRClass(SrcRC) &&
292            "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
293 
294     SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
295     const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
296 
297     Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
298 
299     BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
300             TmpReg)
301         .add(MI.getOperand(I));
302 
303     if (IsAGPR) {
304       const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
305       Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
306       unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
307         AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
308       BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
309             TmpAReg)
310         .addReg(TmpReg, RegState::Kill);
311       TmpReg = TmpAReg;
312     }
313 
314     MI.getOperand(I).setReg(TmpReg);
315   }
316 
317   CopyUse.eraseFromParent();
318   return true;
319 }
320 
321 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
322                                     const MachineInstr *MoveImm,
323                                     const SIInstrInfo *TII,
324                                     unsigned &SMovOp,
325                                     int64_t &Imm) {
326   if (Copy->getOpcode() != AMDGPU::COPY)
327     return false;
328 
329   if (!MoveImm->isMoveImmediate())
330     return false;
331 
332   const MachineOperand *ImmOp =
333       TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
334   if (!ImmOp->isImm())
335     return false;
336 
337   // FIXME: Handle copies with sub-regs.
338   if (Copy->getOperand(0).getSubReg())
339     return false;
340 
341   switch (MoveImm->getOpcode()) {
342   default:
343     return false;
344   case AMDGPU::V_MOV_B32_e32:
345     SMovOp = AMDGPU::S_MOV_B32;
346     break;
347   case AMDGPU::V_MOV_B64_PSEUDO:
348     SMovOp = AMDGPU::S_MOV_B64;
349     break;
350   }
351   Imm = ImmOp->getImm();
352   return true;
353 }
354 
355 template <class UnaryPredicate>
356 bool searchPredecessors(const MachineBasicBlock *MBB,
357                         const MachineBasicBlock *CutOff,
358                         UnaryPredicate Predicate) {
359   if (MBB == CutOff)
360     return false;
361 
362   DenseSet<const MachineBasicBlock *> Visited;
363   SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
364                                                MBB->pred_end());
365 
366   while (!Worklist.empty()) {
367     MachineBasicBlock *MBB = Worklist.pop_back_val();
368 
369     if (!Visited.insert(MBB).second)
370       continue;
371     if (MBB == CutOff)
372       continue;
373     if (Predicate(MBB))
374       return true;
375 
376     Worklist.append(MBB->pred_begin(), MBB->pred_end());
377   }
378 
379   return false;
380 }
381 
382 // Checks if there is potential path From instruction To instruction.
383 // If CutOff is specified and it sits in between of that path we ignore
384 // a higher portion of the path and report it is not reachable.
385 static bool isReachable(const MachineInstr *From,
386                         const MachineInstr *To,
387                         const MachineBasicBlock *CutOff,
388                         MachineDominatorTree &MDT) {
389   if (MDT.dominates(From, To))
390     return true;
391 
392   const MachineBasicBlock *MBBFrom = From->getParent();
393   const MachineBasicBlock *MBBTo = To->getParent();
394 
395   // Do predecessor search.
396   // We should almost never get here since we do not usually produce M0 stores
397   // other than -1.
398   return searchPredecessors(MBBTo, CutOff, [MBBFrom]
399            (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
400 }
401 
402 // Return the first non-prologue instruction in the block.
403 static MachineBasicBlock::iterator
404 getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
405   MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
406   while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
407     ++I;
408 
409   return I;
410 }
411 
// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
//
// The function works in four steps:
//   1. Classify every instruction defining Reg as either an "init" (its only
//      operands are the def of Reg and exactly one immediate) or a "clobber"
//      (anything else).
//   2. For each pair of inits with the same immediate, try to keep only one:
//      drop the dominated one, or move one into the nearest common dominator
//      and drop the other. A merge is rejected when a clobber or an init with
//      a different immediate interferes.
//   3. Erase the instructions that were marked as merged.
//   4. Move each remaining init upwards inside its own block, as far as
//      possible without crossing a read/def of Reg, a scheduling boundary,
//      the block prologue, or a fixed instruction-count threshold.
//
// Returns true if at least one initialization was merged (step 2); step 4 on
// its own does not affect the return value. Kill flags of Reg are cleared
// when the function returns true.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  // NOTE(review): the key type is unsigned while the value stored comes from
  // getImm(); immediates that differ only above the width of unsigned would
  // share a key -- confirm this cannot happen for the registers passed in.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  // Step 1: classify all definitions of Reg.
  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      // Disqualify the instruction as an init if it has a register operand
      // other than the def of Reg, an operand that is neither a register nor
      // an immediate, or a second immediate.
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    // An instruction without any immediate operand also ends up as a clobber.
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  // Step 2: pairwise merging within each group of equal immediates.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        // "From" is the init that would be removed, "To" the position that
        // dominates it and would provide the value instead.
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          // Per-instruction check; this inner lambda shadows the name of the
          // enclosing one.
          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, this is not an interference only if both are
            // dominated by Clobber and belong to the same block or if Clobber
            // properly dominates To, given that To >> From, so it dominates
            // both and located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          // Both real clobbers and inits of a *different* immediate count as
          // potential interference.
          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          // MI1 already provides the value at MI2: MI2 is redundant.
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          // MI2 already provides the value at MI1: MI1 is redundant.
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            // NOTE(review): I1 is advanced here and again after the inner
            // loop, so the element following MI1 is never used as MI1 --
            // confirm this skip is intended.
            ++I1;
            break;
          }
        } else {
          // Neither dominates the other: try to place one copy in the nearest
          // common dominator of both blocks.
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            // MI2 is moved in front of I; MI1 is marked for removal.
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            // NOTE(review): same double advance of I1 as in the branch above.
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  // Step 3: the instructions are erased from their blocks and from the
  // per-immediate lists.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  // Step 4: walk backwards from each surviving init looking for the earliest
  // legal position within the same block.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      // If the search did not move at all, stepping R back yields MI itself
      // and nothing is spliced.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}
582 
/// Entry point of the pass.
///
/// Returns false without touching \p MF when the function carries the
/// "Selected" property. Otherwise the per-function state (MRI, TRI, TII, MDT)
/// is set up and every instruction of every block is visited once. Depending
/// on the opcode the instruction is
///   - COPY / WQM / SOFT_WQM / WWM: fixed if it copies a vector register into
///     a scalar one (or simplified if it copies scalar into vector),
///   - PHI: handed to processPHINode,
///   - REG_SEQUENCE / INSERT_SUBREG: moved to the VALU when a scalar result
///     is built from vector inputs,
///   - V_WRITELANE_B32: adjusted so that at most one SGPR other than M0 is
///     used as a source.
/// All other opcodes are skipped. Finally, M0 initializations are merged and
/// hoisted when optimizing and EnableM0Merge is set.
///
/// \returns true whenever the main loop ran, regardless of whether anything
/// was actually changed.
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  // Only need to run this in SelectionDAG path.
  if (MF.getProperties().hasProperty(
        MachineFunctionProperties::Property::Selected))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;

      // Note: inside this switch "continue" advances the instruction loop,
      // while "break" only leaves the switch (which then also advances, as
      // nothing follows the switch in the loop body).
      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::WWM: {
        Register DstReg = MI.getOperand(0).getReg();

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (!DstReg.isVirtual()) {
          // If the destination register is a physical register there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            // The original source is read through V_READFIRSTLANE_B32 into
            // the new scalar temporary, which then feeds the copy to M0.
            BuildMI(MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
              .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          Register SrcReg = MI.getOperand(1).getReg();
          // A physical source has no defining instruction to inspect, so the
          // copy goes to the VALU directly.
          if (!SrcReg.isVirtual()) {
            TII->moveToVALU(MI, MDT);
            break;
          }

          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy with
          // s_mov_b32.
          // NOTE(review): DefMI is passed on without a null check here.
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          TII->moveToVALU(MI, MDT);
        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          // Best effort: the result of the attempt is intentionally ignored.
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        processPHINode(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE:
        // Nothing to fix if the result already is a vector register or if no
        // input is one; in that case only the copy-folding optimization is
        // attempted.
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI, MDT);
        break;
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        // A scalar result assembled from at least one vector input has to be
        // moved to the VALU.
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI, MDT);
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for trivially easy constant prop into one of the operands
          // If this is the case then perform the operation now to resolve SGPR
          // issue. If we don't do that here we will always insert a mov to m0
          // that can't be resolved in later operand folding pass
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                // The defining instruction must write exactly the register
                // and sub-register that the operand reads.
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    // Replacing one SGPR source by the inline constant is
                    // enough; the other operand is left alone.
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an immediate
            // Move src1 to be in M0
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  // Merge/hoist M0 initializations only when optimizing and not disabled on
  // the command line. The result of the call is not used.
  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  return true;
}
755 
756 void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
757   unsigned numVGPRUses = 0;
758   bool AllAGPRUses = true;
759   SetVector<const MachineInstr *> worklist;
760   SmallSet<const MachineInstr *, 4> Visited;
761   SetVector<MachineInstr *> PHIOperands;
762   worklist.insert(&MI);
763   Visited.insert(&MI);
764   while (!worklist.empty()) {
765     const MachineInstr *Instr = worklist.pop_back_val();
766     Register Reg = Instr->getOperand(0).getReg();
767     for (const auto &Use : MRI->use_operands(Reg)) {
768       const MachineInstr *UseMI = Use.getParent();
769       AllAGPRUses &= (UseMI->isCopy() &&
770                       TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
771                      TRI->isAGPR(*MRI, Use.getReg());
772       if (UseMI->isCopy() || UseMI->isRegSequence()) {
773         if (UseMI->isCopy() &&
774           UseMI->getOperand(0).getReg().isPhysical() &&
775           !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
776           numVGPRUses++;
777         }
778         if (Visited.insert(UseMI).second)
779           worklist.insert(UseMI);
780 
781         continue;
782       }
783 
784       if (UseMI->isPHI()) {
785         const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
786         if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
787           UseRC != &AMDGPU::VReg_1RegClass)
788           numVGPRUses++;
789         continue;
790       }
791 
792       const TargetRegisterClass *OpRC =
793         TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
794       if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
795         OpRC != &AMDGPU::VS_64RegClass) {
796         numVGPRUses++;
797       }
798     }
799   }
800 
801   Register PHIRes = MI.getOperand(0).getReg();
802   const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
803   if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
804     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
805     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
806     for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
807       MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
808       if (DefMI && DefMI->isPHI())
809         PHIOperands.insert(DefMI);
810     }
811   }
812 
813   bool hasVGPRInput = false;
814   for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
815     Register InputReg = MI.getOperand(i).getReg();
816     MachineInstr *Def = MRI->getVRegDef(InputReg);
817     if (TRI->isVectorRegister(*MRI, InputReg)) {
818       if (Def->isCopy()) {
819         Register SrcReg = Def->getOperand(1).getReg();
820         const TargetRegisterClass *RC =
821           TRI->getRegClassForReg(*MRI, SrcReg);
822         if (TRI->isSGPRClass(RC))
823           continue;
824       }
825       hasVGPRInput = true;
826       break;
827     }
828     else if (Def->isCopy() &&
829       TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
830       Register SrcReg = Def->getOperand(1).getReg();
831       MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
832       unsigned SMovOp;
833       int64_t Imm;
834       if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
835         hasVGPRInput = true;
836         break;
837       } else {
838         // Formally, if we did not do this right away
839         // it would be done on the next iteration of the
840         // runOnMachineFunction main loop. But why not if we can?
841         MachineFunction *MF = MI.getParent()->getParent();
842         Def->getOperand(1).ChangeToImmediate(Imm);
843         Def->addImplicitDefUseOperands(*MF);
844         Def->setDesc(TII->get(SMovOp));
845       }
846     }
847   }
848 
849   if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
850        RC0 != &AMDGPU::VReg_1RegClass) &&
851     (hasVGPRInput || numVGPRUses > 1)) {
852     LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
853     TII->moveToVALU(MI);
854   }
855   else {
856     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
857     TII->legalizeOperands(MI, MDT);
858   }
859 
860   // Propagate register class back to PHI operands which are PHI themselves.
861   while (!PHIOperands.empty()) {
862     processPHINode(*PHIOperands.pop_back_val());
863   }
864 }
865