//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  void shrinkMIMG(MachineInstr &MI);

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
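///
/// For example (illustrative), given a literal with a single use:
///   v_mov_b32 v0, 0x1234
///   v_add_f32 v1, v0, v2
/// the literal is folded to give 'v_add_f32 v1, 0x1234, v2' and the mov is
/// erased.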
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
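///
/// For example (illustrative), 0x80000000 is the bit-reverse of the inline
/// immediate 1, so 'v_mov_b32 v0, 0x80000000' can be rewritten as
/// 'v_bfrev_b32 v0, 1', dropping the 32-bit literal.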
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
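  //
  // e.g. (illustrative) s_cmp_gt_u32 s0, 0x1234 can be encoded as
  // s_cmpk_gt_u32 s0, 0x1234, which carries the immediate in 16 bits and
  // drops the 32-bit literal dword.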
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and is initially selected as the unsigned version.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
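//
// For example (illustrative), a GFX10 NSA image_sample whose address operands
// happen to be allocated to the contiguous VGPRs v4, v5, v6 can be re-encoded
// in the sequential (non-NSA) form with a single VReg_96 address starting at
// v4, avoiding the extra NSA address dwords.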
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords <= 8) {
    RC = &AMDGPU::VReg_256RegClass;
    NewAddrDwords = 8;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for an implicit tied operand, which may be present if TFE
  // is enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

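  // Trim the now-unused address operands. Operand indices shift down after
  // each removal, so repeatedly removing the operand at VAddr0Idx + 1 deletes
  // vaddr1 .. vaddr(N-1).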
  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.RemoveOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
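///
/// Illustrative examples (register numbers and literals are arbitrary):
///   s_and_b32 s0, s0, 0xffffdfff  ->  s_bitset0_b32 s0, 13
///   s_or_b32  s0, s0, 0x00010000  ->  s_bitset1_b32 s0, 16
///   s_xor_b32 s0, s0, 0xffffffc0  ->  s_xnor_b32 s0, s0, 63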
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
                                MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII,
                                MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (SrcImm->isImm() &&
      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
    uint32_t NewImm = 0;

    if (Opc == AMDGPU::S_AND_B32) {
      if (isPowerOf2_32(~Imm)) {
        NewImm = countTrailingOnes(Imm);
        Opc = AMDGPU::S_BITSET0_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ANDN2_B32;
      }
    } else if (Opc == AMDGPU::S_OR_B32) {
      if (isPowerOf2_32(Imm)) {
        NewImm = countTrailingZeros(Imm);
        Opc = AMDGPU::S_BITSET1_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ORN2_B32;
      }
    } else if (Opc == AMDGPU::S_XOR_B32) {
      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_XNOR_B32;
      }
    } else {
      llvm_unreachable("unexpected opcode");
    }

    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
        SrcImm == Src0) {
      if (!TII->commuteInstruction(MI, false, 1, 2))
        NewImm = 0;
    }

    if (NewImm != 0) {
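      // Before RA, Dest is still virtual: only record allocation hints so that
      // Dest and SrcReg end up in the same register, and let the post-RA run
      // of this pass perform the actual rewrite (which requires
      // Dest == SrcReg below).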
      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
        SrcReg->isReg()) {
        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
        return true;
      }

      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
        MI.setDesc(TII->get(Opc));
        if (Opc == AMDGPU::S_BITSET0_B32 ||
            Opc == AMDGPU::S_BITSET1_B32) {
          Src0->ChangeToImmediate(NewImm);
          // Remove the immediate and add the tied input.
          MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
          MI.tieOperands(0, 2);
        } else {
          SrcImm->setImm(NewImm);
        }
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                          unsigned Reg, unsigned SubReg,
                          const SIRegisterInfo &TRI) {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
      if (TRI.regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg &&
               TargetRegisterInfo::isVirtualRegister(Reg)) {
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

static bool instReadsReg(const MachineInstr *MI,
                         unsigned Reg, unsigned SubReg,
                         const SIRegisterInfo &TRI) {
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
}

static bool instModifiesReg(const MachineInstr *MI,
                            unsigned Reg, unsigned SubReg,
                            const SIRegisterInfo &TRI) {
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
}

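// Returns the 32-bit register (or register/subregister pair for virtual
// registers) covering dword channel \p I of \p Reg:\p Sub. Used below to
// split a multi-dword swap into individual v_swap_b32 instructions.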
static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
    } else {
      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to prevent folding that could
// remove the matched moves. Preferably it is done before RA, to free the saved
// registers, but also after RA, which can insert copies too.
//
// This is really just a generic peephole rather than a canonical shrinking,
// although the requirements match the pass placement and it reduces code size
// too.
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                               const SIInstrInfo *TII) {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  unsigned T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  unsigned X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.isVGPR(MRI, X))
    return nullptr;

  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
    if (YTop.getSubReg() != Tsub)
      continue;

    MachineInstr &MovY = *YTop.getParent();
    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY.getOpcode() != AMDGPU::COPY) ||
        MovY.getOperand(1).getSubReg() != Tsub)
      continue;

    unsigned Y = MovY.getOperand(0).getReg();
    unsigned Ysub = MovY.getOperand(0).getSubReg();

    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
      continue;

    MachineInstr *MovX = nullptr;
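    // Scan forward from MovT to MovY looking for the single 'mov x, y'
    // (MovX). Give up if anything in between reads X, writes Y or T, or
    // writes X other than MovX itself.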
    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub, TRI) ||
          instModifiesReg(&*I, Y, Ysub, TRI) ||
          instModifiesReg(&*I, T, Tsub, TRI) ||
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      MovX = &*I;
    }

    if (!MovX || I == E)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);

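    // Emit one v_swap_b32 per 32-bit channel covered by the matched moves.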
    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
                TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
    }
    MovX->eraseFromParent();
    MovY.eraseFromParent();
    MachineInstr *Next = &*std::next(MovT.getIterator());
    if (MRI.use_nodbg_empty(T))
      MovT.eraseFromParent();
    else
      Xop.setIsKill(false);

    return Next;
  }

  return nullptr;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                           MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

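        // e.g. (illustrative) s_add_i32 s0, s0, 0x1234 can be encoded as
        // s_addk_i32 s0, 0x1234, which carries the immediate in 16 bits and
        // drops the literal dword.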
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
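      // e.g. (illustrative) s_mov_b32 s0, 0x1234 -> s_movk_i32 s0, 0x1234.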
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}