//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
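/// For example, if src0 is defined by a v_mov_b32 of the literal 0x1234, the
/// literal can be folded directly into the use and the mov erased once it has
/// no remaining non-debug users.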
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
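/// For example, the literal 0x80000000 bit-reverses to 1, which is an inline
/// immediate, so a move of that literal can instead be encoded as a bitreverse
/// of 1 and save the 4-byte literal dword.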
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from \p MI to \p NewMI that are not part of
/// the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

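// Try to rewrite an SOPC compare with a 16-bit literal into the SOPK form,
// e.g. "s_cmp_eq_u32 s0, 0x1234" becomes "s_cmpk_eq_u32 s0, 0x1234", encoding
// the immediate in the instruction word instead of a trailing literal dword.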
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions compute scc = src0 <cc> imm16, so commute the
  // instruction to get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register.
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and these compares are initially selected as the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
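// For example, an NSA image_sample whose address registers happen to have been
// allocated as a contiguous tuple (v4, v5, v6) can instead use the default
// encoding with a single v[4:6] address operand.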
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - these may be present if TFE or
  // LWE is enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand.
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand.
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
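// For example, a VOP3 v_fma_f32 whose src2 is a non-inline literal can become
// v_fmaak_f32 (a VOP2 form with a trailing 32-bit constant), and one whose
// src0 or src1 is the literal can become v_fmamk_f32, provided the other
// source operands satisfy the VOP2 register constraints.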
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  if (!ST->hasVOP3Literal())
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
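/// For example, "s_and_b32 s0, s0, 0xffffffef" can become
/// "s_bitset0_b32 s0, 4", and "s_and_b32 s0, s0, 0xffffffcf" can become
/// "s_andn2_b32 s0, s0, 48".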
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and the move can be eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create a
// v_swap_b32.
//
// This must not be done too early, so as not to prevent folding that could
// remove the matched moves. It should preferably run before RA, to release the
// saved registers, and possibly again after RA, since RA can insert copies
// too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although the requirements match the pass placement and it reduces code size
// too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg()        ||
        MovY->getOperand(1).getReg() != T   ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx10+.
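// For example, the unused carry-out of a V_ADD_CO_U32_e64 can be redirected to
// null so no SGPR has to be allocated for it.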
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (ST->getGeneration() < AMDGPUSubtarget::GFX10)
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after other
        // optimizations have run, because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
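      // For example, once the destination and first source are the same
      // register, "s_add_i32 s0, s0, 0x1234" becomes "s_addk_i32 s0, 0x1234"
      // with the immediate encoded directly in the instruction.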
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
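      // e.g. "s_mov_b32 s0, 0x1234" becomes "s_movk_i32 s0, 0x1234", and
      // "s_mov_b32 s0, 0x80000000" becomes "s_brev_b32 s0, 1".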
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace a dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}