//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

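/// \returns true if \p MO is a register operand whose register class contains
/// VGPRs.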
static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}

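/// \returns true if the operands and modifier settings of \p MI allow it to be
/// rewritten with a 32-bit VOP encoding.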
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {

  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instructions with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it.  It can only be shrunk if the third operand
  // is vcc.  We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_ADDC_U32_e64:
      case AMDGPU::V_SUBB_U32_e64:
      case AMDGPU::V_SUBBREV_U32_e64:
        if (!isVGPR(TII->getNamedOperand(MI, AMDGPU::OpName::src1), TRI, MRI))
          return false;
        // Additional verification is needed for sdst/src2.
        return true;

      case AMDGPU::V_MAC_F32_e64:
      case AMDGPU::V_MAC_F16_e64:
        if (!isVGPR(Src2, TRI, MRI) ||
            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
               TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers
  return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

// Copy the kill/undef flags from Orig onto the implicit VCC use operand of MI.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {

  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}

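/// \returns true if \p Src is a signed 16-bit immediate that is not already
/// representable as an inline constant.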
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

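/// \returns true if \p Src is an unsigned 16-bit immediate that is not already
/// representable as an inline constant.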
static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

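/// \returns true if \p Src fits in 16 bits as either a signed or unsigned
/// immediate and is not representable as an inline constant. \p IsUnsigned
/// reports which interpretation was used.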
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from \p MI to \p NewMI that are not part of
/// the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

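/// Try to rewrite a SOPC scalar comparison into the equivalent SOPK form when
/// its immediate operand fits in 16 bits.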
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

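  // Note: this pass is expected to run both before and after register
  // allocation. Pre-RA it mostly records VCC allocation hints (see the VOPC
  // and carry-out handling below); post-RA it performs the actual rewrites to
  // the 32-bit encodings.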
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware of whether the immediate will be
      // free, since it won't know if the src == dest constraint will end up
      // being satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }


      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.add(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.add(*Src1);

      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.add(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}