//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

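/// Pass that rewrites 64-bit VALU encodings to their 32-bit forms and scalar
/// ALU instructions to their SOPK forms when their operands allow it, folding
/// literal constants into the shrunk instructions while still in SSA.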
class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

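/// Return true if \p MO is a register operand whose register class contains
/// VGPRs, handling both virtual and physical registers.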
static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}

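/// Check whether \p MI can use a 32-bit VOP encoding: instructions with a
/// third source operand are rejected (aside from a couple of special cases),
/// src1 must be a VGPR, and no source, output (omod), or clamp modifiers may
/// be set.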
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {

  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink an instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it.  It can only be shrunk if the third operand
  // is vcc.  We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then doing the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_MAC_F32_e64:
        if (!isVGPR(Src2, TRI, MRI) ||
            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mod =
      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
    return false;

  // We don't need to check src0; all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
/// and will only fold literal constants if we are still in SSA.
static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {

  if (!MRI.isSSA())
    return;

  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // Only one literal constant is allowed per instruction, so if src0 is a
  // literal constant then we can't do any folding.
  if (Src0.isImm() &&
      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
    return;

  // Try to fold Src0
  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
    unsigned Reg = Src0.getReg();
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &MovSrc = Def->getOperand(1);
      bool ConstantFolded = false;

      if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                             isUInt<32>(MovSrc.getImm()))) {
        Src0.ChangeToImmediate(MovSrc.getImm());
        ConstantFolded = true;
      }
      if (ConstantFolded) {
        if (MRI.use_empty(Reg))
          Def->eraseFromParent();
        ++NumLiteralConstantsFolded;
        return;
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
    foldImmediates(MI, TII, MRI, false);
}

// Copy the undef and kill flags from the original explicit operand \p Orig
// onto the implicit VCC use of \p MI.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {

  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}

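/// Return true if \p Src is a signed immediate that fits in 16 bits and is
/// not already representable as an inline constant.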
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}

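/// Return true if \p Src is an unsigned immediate that fits in 16 bits and is
/// not already representable as an inline constant.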
static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}

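/// Return true if \p Src fits in 16 bits as either a signed or an unsigned
/// immediate (reported through \p IsUnsigned) and is not an inline constant.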
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src, 4);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src, 4);
  }

  return false;
}

/// Copy the implicit register operands of \p MI that are not part of its
/// instruction definition onto \p NewMI.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

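/// Try to rewrite a scalar comparison against an immediate into the
/// corresponding SOPK form (s_cmpk_*), which encodes the constant as imm16
/// instead of a 32-bit literal.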
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and these compares are initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int64_t Imm = Src.getImm();
          if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
            int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
            if (ReverseImm >= -16 && ReverseImm <= 64) {
              MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
              Src.setImm(ReverseImm);
              continue;
            }
          }
        }
      }

      // Combine adjacent s_nop instructions, using the immediate operand to
      // encode how long to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && isKImmOperand(TII, Src))
          MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}