1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 /// The pass tries to use the 32-bit encoding for instructions when possible.
8 //===----------------------------------------------------------------------===//
9 //
10 
11 #include "AMDGPU.h"
12 #include "GCNSubtarget.h"
13 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14 #include "llvm/ADT/Statistic.h"
15 #include "llvm/CodeGen/MachineFunctionPass.h"
16 
17 #define DEBUG_TYPE "si-shrink-instructions"
18 
19 STATISTIC(NumInstructionsShrunk,
20           "Number of 64-bit instruction reduced to 32-bit.");
21 STATISTIC(NumLiteralConstantsFolded,
22           "Number of literal constants folded into 32-bit instructions.");
23 
24 using namespace llvm;
25 
26 namespace {
27 
/// Machine function pass that rewrites 64-bit encoded VALU/SALU instructions
/// into their shorter 32-bit encodings (e32, SOPK, non-NSA MIMG, ...) when
/// the operands allow it, and performs a few related peepholes (v_swap_b32
/// matching, literal folding, bit-reversed inline immediates).
class SIShrinkInstructions : public MachineFunctionPass {
  // Cached per-function state; initialized at the top of
  // runOnMachineFunction() before any helper is called.
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  // Fold a literal-materializing mov into MI's src0 (commuting if needed).
  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  // Predicates classifying an immediate operand as a signed/unsigned 16-bit
  // literal that is not already an inline constant.
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  // True if the bit-reversal of Src's value is an inline immediate.
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  // Copy implicit operands added beyond the instruction definition.
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  // Opcode-specific shrinking helpers.
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  // Subregister-aware read/write queries used by the v_swap matcher.
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  // Erase MI but preserve its extra implicit defs as IMPLICIT_DEFs.
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  // Try to rewrite a mov triple into v_swap_b32.
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
70 
71 } // End anonymous namespace.
72 
// Register the pass with the legacy pass manager.
INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

/// Factory used by the AMDGPU target machine to add this pass to the
/// codegen pipeline.
FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}
81 
/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    // Only fold when this is the mov's sole use; otherwise the mov has to
    // stay anyway and folding would just duplicate the literal.
    if (Reg.isVirtual() && MRI->hasOneUse(Reg)) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          // Plain immediates, frame indices and globals can all replace the
          // register operand in place.
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          // The mov fed only this use, so it is now dead.
          assert(MRI->use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      // Recurse with TryToCommute=false so we only commute once.
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}
139 
140 bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
141   return isInt<16>(Src.getImm()) &&
142     !TII->isInlineConstant(*Src.getParent(),
143                            Src.getParent()->getOperandNo(&Src));
144 }
145 
146 bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
147   return isUInt<16>(Src.getImm()) &&
148     !TII->isInlineConstant(*Src.getParent(),
149                            Src.getParent()->getOperandNo(&Src));
150 }
151 
152 bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
153                                                 bool &IsUnsigned) const {
154   if (isInt<16>(Src.getImm())) {
155     IsUnsigned = false;
156     return !TII->isInlineConstant(Src);
157   }
158 
159   if (isUInt<16>(Src.getImm())) {
160     IsUnsigned = true;
161     return !TII->isInlineConstant(Src);
162   }
163 
164   return false;
165 }
166 
167 /// \returns true if the constant in \p Src should be replaced with a bitreverse
168 /// of an inline immediate.
169 bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
170                                               int32_t &ReverseImm) const {
171   if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
172     return false;
173 
174   ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
175   return ReverseImm >= -16 && ReverseImm <= 64;
176 }
177 
178 /// Copy implicit register operands from specified instruction to this
179 /// instruction that are not part of the instruction definition.
180 void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
181                                                 MachineInstr &MI) const {
182   MachineFunction &MF = *MI.getMF();
183   for (unsigned i = MI.getDesc().getNumOperands() +
184          MI.getDesc().getNumImplicitUses() +
185          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
186        i != e; ++i) {
187     const MachineOperand &MO = MI.getOperand(i);
188     if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
189       NewMI.addOperand(MF, MO);
190   }
191 }
192 
/// Rewrite an SOPC compare into its SOPK form (s_cmpk_*) when the RHS is a
/// 16-bit literal, saving the 32-bit literal dword.
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  // (the commute above may have failed, so re-check).
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  // Look up the SOPK counterpart of this opcode; not all compares have one.
  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      // If the literal only fits as a signed imm16, switch to the signed
      // eq/ne opcode; semantics are identical for equality comparisons.
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  // For the remaining compares, the literal must match the signedness the
  // SOPK opcode expects (zero-extended vs. sign-extended imm16).
  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}
235 
236 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  // Pick the contiguous register class matching the address size. Sizes
  // above 8 dwords have no exact class and are rounded up to VReg_512.
  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  // The NSA address VGPRs can only be collapsed into a single wide operand
  // if they happen to be physically consecutive (this runs post-RA).
  unsigned VgprBase = 0;
  bool IsUndef = true;
  // Rounding up the class (8 < dwords < 16) reads extra lanes, so kill
  // flags can only be preserved when no padding registers are added.
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  // The wide register must not run past the 256-VGPR file.
  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        // Untie so the address operands can be removed without confusing
        // the tie bookkeeping; re-tied below at the shifted index.
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  // Replace vaddr0 with the wide register and drop the remaining address
  // operands (removing at VAddr0Idx + 1 repeatedly shifts them down).
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    // Re-tie vdata to the implicit operand, accounting for the
    // VAddrDwords - 1 operands just removed before it.
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}
326 
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
328 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
329 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or
330 /// XNOR (as a ^ b == ~(a ^ ~b)).
331 /// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  // Nothing to do if src1 is not a literal, or is already inlineable.
  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      // AND with all-ones-except-one-bit clears a single bit: s_bitset0.
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      // a & b == a &~ ~b, and ~Imm is inlineable.
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      // OR with a single bit set: s_bitset1.
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      // a | b == a |~ ~b.
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      // a ^ b == ~(a ^ ~b).
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  // NOTE(review): a replacement whose new immediate is legitimately 0
  // (e.g. s_bitset0 of bit 0) is skipped by this guard — looks like a
  // missed optimization rather than a correctness issue; confirm upstream.
  if (NewImm != 0) {
    // Pre-RA (virtual dest): only add allocation hints so that src and dest
    // get the same register, which the tied forms below require; the actual
    // rewrite happens on the post-RA run of this pass.
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}
399 
400 // This is the same as MachineInstr::readsRegister/modifiesRegister except
401 // it takes subregs into account.
402 bool SIShrinkInstructions::instAccessReg(
403     iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
404     unsigned SubReg) const {
405   for (const MachineOperand &MO : R) {
406     if (!MO.isReg())
407       continue;
408 
409     if (Reg.isPhysical() && MO.getReg().isPhysical()) {
410       if (TRI->regsOverlap(Reg, MO.getReg()))
411         return true;
412     } else if (MO.getReg() == Reg && Reg.isVirtual()) {
413       LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
414                             TRI->getSubRegIndexLaneMask(MO.getSubReg());
415       if (Overlap.any())
416         return true;
417     }
418   }
419   return false;
420 }
421 
// Subregister-aware check whether \p MI reads \p Reg:\p SubReg.
bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}
426 
// Subregister-aware check whether \p MI writes \p Reg:\p SubReg.
bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}
431 
432 TargetInstrInfo::RegSubRegPair
433 SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
434                                         unsigned I) const {
435   if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
436     if (Reg.isPhysical()) {
437       Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
438     } else {
439       Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
440     }
441   }
442   return TargetInstrInfo::RegSubRegPair(Reg, Sub);
443 }
444 
445 void SIShrinkInstructions::dropInstructionKeepingImpDefs(
446     MachineInstr &MI) const {
447   for (unsigned i = MI.getDesc().getNumOperands() +
448          MI.getDesc().getNumImplicitUses() +
449          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
450        i != e; ++i) {
451     const MachineOperand &Op = MI.getOperand(i);
452     if (!Op.isDef())
453       continue;
454     BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
455             TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
456   }
457 
458   MI.eraseFromParent();
459 }
460 
461 // Match:
462 // mov t, x
463 // mov x, y
464 // mov y, t
465 //
466 // =>
467 //
468 // mov t, x (t is potentially dead and move eliminated)
469 // v_swap_b32 x, y
470 //
471 // Returns next valid instruction pointer if was able to create v_swap_b32.
472 //
473 // This shall not be done too early not to prevent possible folding which may
474 // remove matched moves, and this should preferably be done before RA to
475 // release saved registers and also possibly after RA which can insert copies
476 // too.
477 //
478 // This is really just a generic peephole that is not a canonical shrinking,
479 // although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  // MovT is the "mov t, x" of the pattern; record t and x.
  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  // Number of 32-bit lanes being swapped (COPY may move a wider register).
  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  // An implicit M0 use means this is really an indirect move; don't touch.
  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  // Scan forward a bounded number of instructions looking for the closing
  // "mov y, t"; stop early if t is killed.
  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    // Candidate MovY must be "mov y, t" with matching subreg and no M0 use.
    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg()        ||
        MovY->getOperand(1).getReg() != T   ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    // Between MovT and MovY there must be exactly one "mov x, y" (MovX),
    // and nothing else may read x or clobber x, y, or t.
    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        // An instruction that writes x before we found MovX breaks the
        // pattern.
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      // Multi-lane swaps can't carry extra implicit operands per lane.
      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    // Emit one v_swap_b32 per 32-bit lane at MovX's position.
    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    // MovX and MovY are fully replaced by the swap; MovY's extra implicit
    // defs are preserved as IMPLICIT_DEFs.
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      // t has no remaining uses; MovT itself is dead too.
      dropInstructionKeepingImpDefs(MovT);
    } else {
      // MovT stays, but x is no longer killed here since the swap reads it.
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}
602 
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  // Cache per-function state used by all the helpers.
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // e32 forms implicitly write the carry/condition to VCC; its width
  // depends on the wavefront size.
  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    // Next is captured up front so transforms that erase or insert
    // instructions (e.g. matchSwap) can redirect the iteration.
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      // Try to collapse mov/mov/mov triples into v_swap_b32.
      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        // Get the constant onto src1 so the SOPK form (tied src0 == dst,
        // imm16 src1) can apply.
        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        // Post-RA: if allocation satisfied src0 == dst, shrink to SOPK.
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        // Same post-RA proxy check as the V_MOV_B32 case above.
        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      // NSA -> non-NSA MIMG shrinking is only valid post-RA (NoVRegs),
      // since it requires physically contiguous address VGPRs.
      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI))
          continue;
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        // NOTE(review): this local shadows the iterator `Next` declared at
        // the top of the block loop; harmless here but worth renaming.
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  // This pass rewrites instructions in place and never reports a change to
  // the pass manager; it preserves the CFG (see getAnalysisUsage).
  return false;
}
830