1 //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Copies from VGPR to SGPR registers are illegal and the register coalescer
11 /// will sometimes generate these illegal copies in situations like this:
12 ///
13 ///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
14 ///
15 /// BB0:
16 ///   %0 <sgpr> = SCALAR_INST
17 ///   %1 <vsrc> = COPY %0 <sgpr>
18 ///    ...
19 ///    BRANCH %cond BB1, BB2
20 ///  BB1:
21 ///    %2 <vgpr> = VECTOR_INST
22 ///    %3 <vsrc> = COPY %2 <vgpr>
23 ///  BB2:
///    %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
25 ///    %5 <vgpr> = VECTOR_INST %4 <vsrc>
26 ///
27 ///
28 /// The coalescer will begin at BB0 and eliminate its copy, then the resulting
29 /// code will look like this:
30 ///
31 /// BB0:
32 ///   %0 <sgpr> = SCALAR_INST
33 ///    ...
34 ///    BRANCH %cond BB1, BB2
35 /// BB1:
36 ///   %2 <vgpr> = VECTOR_INST
37 ///   %3 <vsrc> = COPY %2 <vgpr>
38 /// BB2:
39 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
40 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
41 ///
42 /// Now that the result of the PHI instruction is an SGPR, the register
43 /// allocator is now forced to constrain the register class of %3 to
44 /// <sgpr> so we end up with final code like this:
45 ///
46 /// BB0:
47 ///   %0 <sgpr> = SCALAR_INST
48 ///    ...
49 ///    BRANCH %cond BB1, BB2
50 /// BB1:
51 ///   %2 <vgpr> = VECTOR_INST
52 ///   %3 <sgpr> = COPY %2 <vgpr>
53 /// BB2:
54 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
55 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
56 ///
57 /// Now this code contains an illegal copy from a VGPR to an SGPR.
58 ///
59 /// In order to avoid this problem, this pass searches for PHI instructions
60 /// which define a <vsrc> register and constrains its definition class to
61 /// <vgpr> if the user of the PHI's definition register is a vector instruction.
62 /// If the PHI's definition class is constrained to <vgpr> then the coalescer
/// will be unable to perform the COPY removal from the above example which
64 /// ultimately led to the creation of an illegal COPY.
65 //===----------------------------------------------------------------------===//
66 
67 #include "AMDGPU.h"
68 #include "AMDGPUSubtarget.h"
69 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
70 #include "SIInstrInfo.h"
71 #include "SIRegisterInfo.h"
72 #include "llvm/ADT/DenseSet.h"
73 #include "llvm/ADT/STLExtras.h"
74 #include "llvm/ADT/SmallSet.h"
75 #include "llvm/ADT/SmallVector.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunction.h"
79 #include "llvm/CodeGen/MachineFunctionPass.h"
80 #include "llvm/CodeGen/MachineInstr.h"
81 #include "llvm/CodeGen/MachineInstrBuilder.h"
82 #include "llvm/CodeGen/MachineOperand.h"
83 #include "llvm/CodeGen/MachineRegisterInfo.h"
84 #include "llvm/CodeGen/TargetRegisterInfo.h"
85 #include "llvm/InitializePasses.h"
86 #include "llvm/Pass.h"
87 #include "llvm/Support/CodeGen.h"
88 #include "llvm/Support/CommandLine.h"
89 #include "llvm/Support/Debug.h"
90 #include "llvm/Support/raw_ostream.h"
91 #include "llvm/Target/TargetMachine.h"
92 #include <cassert>
93 #include <cstdint>
94 #include <iterator>
95 #include <list>
96 #include <map>
97 #include <tuple>
98 #include <utility>
99 
100 using namespace llvm;
101 
102 #define DEBUG_TYPE "si-fix-sgpr-copies"
103 
// Command-line toggle for the M0-initialization hoisting/merging step run at
// the end of this pass (see hoistAndMergeSGPRInits). Enabled by default.
static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));
108 
109 namespace {
110 
class SIFixSGPRCopies : public MachineFunctionPass {
  // Dominator tree, used when moving instructions to the VALU and when
  // hoisting/merging SGPR initializations.
  MachineDominatorTree *MDT;

public:
  static char ID;

  // Cached per-function state, initialized in runOnMachineFunction.
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  // Legalize one PHI node; returns a new basic block if legalization split
  // the CFG, otherwise nullptr.
  MachineBasicBlock *processPHINode(MachineInstr &MI);

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
136 
137 } // end anonymous namespace
138 
// Register the pass and declare its dependency on the dominator tree.
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                     "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                     "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

// Identifier other code uses to refer to this pass.
char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

// Factory hook used by the AMDGPU pass pipeline.
FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}
152 
153 static bool hasVectorOperands(const MachineInstr &MI,
154                               const SIRegisterInfo *TRI) {
155   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
156   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
157     if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
158       continue;
159 
160     if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
161       return true;
162   }
163   return false;
164 }
165 
166 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
167 getCopyRegClasses(const MachineInstr &Copy,
168                   const SIRegisterInfo &TRI,
169                   const MachineRegisterInfo &MRI) {
170   Register DstReg = Copy.getOperand(0).getReg();
171   Register SrcReg = Copy.getOperand(1).getReg();
172 
173   const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
174                                          ? MRI.getRegClass(SrcReg)
175                                          : TRI.getPhysRegClass(SrcReg);
176 
177   // We don't really care about the subregister here.
178   // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
179 
180   const TargetRegisterClass *DstRC = DstReg.isVirtual()
181                                          ? MRI.getRegClass(DstReg)
182                                          : TRI.getPhysRegClass(DstReg);
183 
184   return std::make_pair(SrcRC, DstRC);
185 }
186 
187 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
188                              const TargetRegisterClass *DstRC,
189                              const SIRegisterInfo &TRI) {
190   return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
191          TRI.hasVectorRegisters(SrcRC);
192 }
193 
194 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
195                              const TargetRegisterClass *DstRC,
196                              const SIRegisterInfo &TRI) {
197   return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
198          TRI.hasVectorRegisters(DstRC);
199 }
200 
201 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
202                                       const SIRegisterInfo *TRI,
203                                       const SIInstrInfo *TII) {
204   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
205   auto &Src = MI.getOperand(1);
206   Register DstReg = MI.getOperand(0).getReg();
207   Register SrcReg = Src.getReg();
208   if (!SrcReg.isVirtual() || !DstReg.isVirtual())
209     return false;
210 
211   for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
212     const auto *UseMI = MO.getParent();
213     if (UseMI == &MI)
214       continue;
215     if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
216         UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
217         !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
218       return false;
219   }
220   // Change VGPR to SGPR destination.
221   MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
222   return true;
223 }
224 
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
// VGPRz = COPY SGPRy
//
// ==>
//
// VGPRx = COPY SGPRx
// VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  // Only applies to SGPR REG_SEQUENCEs whose single use is a COPY to a
  // vector register.
  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (CopyUse.getOperand(0).getReg().isPhysical())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  // If the copy's users can be retargeted to SGPRs instead, prefer that:
  // the REG_SEQUENCE stays scalar and no per-element copies are needed.
  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  // Retype the REG_SEQUENCE result to the copy's vector destination class.
  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->hasAGPRs(DstRC);

  // REG_SEQUENCE operands come in (register, subreg-index) pairs, hence the
  // I += 2 stride. Insert a COPY in front of each scalar input (plus, for
  // AGPR destinations, a second hop into an AGPR) so the rewritten
  // REG_SEQUENCE only sees vector inputs.
  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    Register SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      // For a 32-bit AGPR element use V_ACCVGPR_WRITE_B32 rather than a
      // plain COPY; wider elements keep COPY.
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
            TmpAReg)
        .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}
320 
321 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
322                                     const MachineInstr *MoveImm,
323                                     const SIInstrInfo *TII,
324                                     unsigned &SMovOp,
325                                     int64_t &Imm) {
326   if (Copy->getOpcode() != AMDGPU::COPY)
327     return false;
328 
329   if (!MoveImm->isMoveImmediate())
330     return false;
331 
332   const MachineOperand *ImmOp =
333       TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
334   if (!ImmOp->isImm())
335     return false;
336 
337   // FIXME: Handle copies with sub-regs.
338   if (Copy->getOperand(0).getSubReg())
339     return false;
340 
341   switch (MoveImm->getOpcode()) {
342   default:
343     return false;
344   case AMDGPU::V_MOV_B32_e32:
345     SMovOp = AMDGPU::S_MOV_B32;
346     break;
347   case AMDGPU::V_MOV_B64_PSEUDO:
348     SMovOp = AMDGPU::S_MOV_B64;
349     break;
350   }
351   Imm = ImmOp->getImm();
352   return true;
353 }
354 
355 template <class UnaryPredicate>
356 bool searchPredecessors(const MachineBasicBlock *MBB,
357                         const MachineBasicBlock *CutOff,
358                         UnaryPredicate Predicate) {
359   if (MBB == CutOff)
360     return false;
361 
362   DenseSet<const MachineBasicBlock *> Visited;
363   SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
364                                                MBB->pred_end());
365 
366   while (!Worklist.empty()) {
367     MachineBasicBlock *MBB = Worklist.pop_back_val();
368 
369     if (!Visited.insert(MBB).second)
370       continue;
371     if (MBB == CutOff)
372       continue;
373     if (Predicate(MBB))
374       return true;
375 
376     Worklist.append(MBB->pred_begin(), MBB->pred_end());
377   }
378 
379   return false;
380 }
381 
382 // Checks if there is potential path From instruction To instruction.
383 // If CutOff is specified and it sits in between of that path we ignore
384 // a higher portion of the path and report it is not reachable.
385 static bool isReachable(const MachineInstr *From,
386                         const MachineInstr *To,
387                         const MachineBasicBlock *CutOff,
388                         MachineDominatorTree &MDT) {
389   if (MDT.dominates(From, To))
390     return true;
391 
392   const MachineBasicBlock *MBBFrom = From->getParent();
393   const MachineBasicBlock *MBBTo = To->getParent();
394 
395   // Do predecessor search.
396   // We should almost never get here since we do not usually produce M0 stores
397   // other than -1.
398   return searchPredecessors(MBBTo, CutOff, [MBBFrom]
399            (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
400 }
401 
402 // Return the first non-prologue instruction in the block.
403 static MachineBasicBlock::iterator
404 getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
405   MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
406   while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
407     ++I;
408 
409   return I;
410 }
411 
// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  // Classify each def of Reg: a def of the plain form "Reg = imm" (only one
  // immediate, no register uses, no other defs) is an init keyed by its
  // immediate; anything else is treated as a clobber of Reg.
  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  // For each pair of same-immediate inits, try to merge them when no clobber
  // (and no init of a different value) can interfere on paths between them.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          // Does this single Clobber instruction interfere with the
          // From->To region?
          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, this is not an interference only if both are
            // dominated by Clobber and belong to the same block or if Clobber
            // properly dominates To, given that To >> From, so it dominates
            // both and located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          // Interference arises from any clobber, or from any init that
          // writes a different immediate value.
          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            // MI1 dominates MI2, so MI2 is redundant: mark it merged.
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            // MI2 dominates MI1, so MI1 is redundant: mark it merged.
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          // Neither dominates the other: hoist MI2 to the nearest common
          // dominator block and mark MI1 merged.
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto MI : Defs) {
      auto MBB = MI->getParent();
      MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  // Kill flags may no longer be valid after merging/moving defs.
  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}
582 
// Main driver: scan every instruction, fixing illegal VGPR->SGPR copies,
// PHIs, REG_SEQUENCEs, INSERT_SUBREGs and V_WRITELANE_B32 constant-bus
// violations. moveToVALU may split blocks, so the block/instruction
// iterators are refreshed after each call that can do so.
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  // Only need to run this in SelectionDAG path.
  if (MF.getProperties().hasProperty(
        MachineFunctionProperties::Property::Selected))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {
    MachineBasicBlock *MBB = &*BI;
    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::WWM: {
        Register DstReg = MI.getOperand(0).getReg();

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (!DstReg.isVirtual()) {
          // If the destination register is a physical register there isn't
          // really much we can do to fix this.
          // Some special instructions use M0 as an input. Some even only use
          // the first lane. Insert a readfirstlane and hope for the best.
          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
            Register TmpReg
              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(*MBB, MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
                .add(MI.getOperand(1));
            MI.getOperand(1).setReg(TmpReg);
          }

          continue;
        }

        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          Register SrcReg = MI.getOperand(1).getReg();
          if (!SrcReg.isVirtual()) {
            // Physical source register: move the copy to the VALU. Refresh
            // the iterators in case the block was split.
            MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
            if (NewBB && NewBB != MBB) {
              MBB = NewBB;
              E = MBB->end();
              BI = MachineFunction::iterator(MBB);
              BE = MF.end();
            }
            assert((!NewBB || NewBB == I->getParent()) &&
                   "moveToVALU did not return the right basic block");
            break;
          }

          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy with
          // s_mov_b32.
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          // Otherwise the whole copy must move to the VALU.
          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
          if (NewBB && NewBB != MBB) {
            MBB = NewBB;
            E = MBB->end();
            BI = MachineFunction::iterator(MBB);
            BE = MF.end();
          }
          assert((!NewBB || NewBB == I->getParent()) &&
                 "moveToVALU did not return the right basic block");
        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        MachineBasicBlock *NewBB = processPHINode(MI);
        if (NewBB && NewBB != MBB) {
          MBB = NewBB;
          E = MBB->end();
          BI = MachineFunction::iterator(MBB);
          BE = MF.end();
        }
        assert((!NewBB || NewBB == I->getParent()) &&
               "moveToVALU did not return the right basic block");
        break;
      }
      case AMDGPU::REG_SEQUENCE: {
        // A REG_SEQUENCE with a vector result, or all-scalar operands, may
        // be handled by distributing the copy; otherwise move it to VALU.
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
        if (NewBB && NewBB != MBB) {
          MBB = NewBB;
          E = MBB->end();
          BI = MachineFunction::iterator(MBB);
          BE = MF.end();
        }
        assert((!NewBB || NewBB == I->getParent()) &&
               "moveToVALU did not return the right basic block");
        break;
      }
      case AMDGPU::INSERT_SUBREG: {
        // An SGPR INSERT_SUBREG with any vector input is illegal; move it
        // to the VALU.
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
          if (NewBB && NewBB != MBB) {
            MBB = NewBB;
            E = MBB->end();
            BI = MachineFunction::iterator(MBB);
            BE = MF.end();
          }
          assert((!NewBB || NewBB == I->getParent()) &&
                 "moveToVALU did not return the right basic block");
        }
        break;
      }
      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case it
        // is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check for trivially easy constant prop into one of the operands
          // If this is the case then perform the operation now to resolve SGPR
          // issue. If we don't do that here we will always insert a mov to m0
          // that can't be resolved in later operand folding pass
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && TII->isFoldableCopy(*DefMI)) {
                const MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  const MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an immediate
            // Move src1 to be in M0
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  // Optionally hoist/merge M0 initializations across the function.
  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  return true;
}
796 
// Legalize a PHI node: decide from the PHI's (transitive) uses and inputs
// whether its result should stay scalar, become an AGPR, or move to the
// VALU, then apply the decision. Returns the new basic block if moveToVALU
// split the CFG, otherwise nullptr.
MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  unsigned numVGPRUses = 0;
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  MachineBasicBlock *CreatedBB = nullptr;
  worklist.insert(&MI);
  Visited.insert(&MI);
  // Walk the transitive uses of the PHI result, looking through copies and
  // reg_sequences, counting uses that demand a VGPR and tracking whether
  // every use is an AGPR use.
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    Register Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        // A copy into a non-SGPR physical register counts as a VGPR use.
        if (UseMI->isCopy() &&
          UseMI->getOperand(0).getReg().isPhysical() &&
          !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
          numVGPRUses++;
        }
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }

      if (UseMI->isPHI()) {
        // A use in a non-SGPR PHI (other than a VReg_1 lane mask) counts
        // as a VGPR use.
        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
          UseRC != &AMDGPU::VReg_1RegClass)
          numVGPRUses++;
        continue;
      }

      // Any other user whose operand class is neither scalar nor VS_32/VS_64
      // (which accept either bank) requires a VGPR.
      const TargetRegisterClass *OpRC =
        TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
        OpRC != &AMDGPU::VS_64RegClass) {
        numVGPRUses++;
      }
    }
  }

  // If every use is an AGPR use but a vector register is still demanded,
  // retype the PHI result to the equivalent AGPR class and queue any PHI
  // inputs that are themselves PHIs for the same treatment.
  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  // Determine whether any PHI input is a genuine VGPR value, i.e. not just
  // a copy of an SGPR or a foldable immediate.
  bool hasVGPRInput = false;
  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
    Register InputReg = MI.getOperand(i).getReg();
    MachineInstr *Def = MRI->getVRegDef(InputReg);
    if (TRI->isVectorRegister(*MRI, InputReg)) {
      if (Def->isCopy()) {
        // A vector register that merely copies an SGPR does not force the
        // PHI to the VALU.
        Register SrcReg = Def->getOperand(1).getReg();
        const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, SrcReg);
        if (TRI->isSGPRClass(RC))
          continue;
      }
      hasVGPRInput = true;
      break;
    }
    else if (Def->isCopy() &&
      TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
      Register SrcReg = Def->getOperand(1).getReg();
      MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
      unsigned SMovOp;
      int64_t Imm;
      if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
        hasVGPRInput = true;
        break;
      } else {
        // Formally, if we did not do this right away
        // it would be done on the next iteration of the
        // runOnMachineFunction main loop. But why not if we can?
        MachineFunction *MF = MI.getParent()->getParent();
        Def->getOperand(1).ChangeToImmediate(Imm);
        Def->addImplicitDefUseOperands(*MF);
        Def->setDesc(TII->get(SMovOp));
      }
    }
  }

  // A scalar (non-lane-mask) PHI with real VGPR inputs, or more than one
  // VGPR use, must move to the VALU; otherwise legalize operands in place.
  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
       RC0 != &AMDGPU::VReg_1RegClass) &&
    (hasVGPRInput || numVGPRUses > 1)) {
    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
    CreatedBB = TII->moveToVALU(MI);
  }
  else {
    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate register class back to PHI operands which are PHI themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
  return CreatedBB;
}
908