1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 /// The pass tries to use the 32-bit encoding for instructions when possible.
9 //===----------------------------------------------------------------------===//
10 //
11 
12 #include "AMDGPU.h"
13 #include "AMDGPUMCInstLower.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIInstrInfo.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #include "llvm/CodeGen/MachineInstrBuilder.h"
19 #include "llvm/CodeGen/MachineRegisterInfo.h"
20 #include "llvm/IR/Constants.h"
21 #include "llvm/IR/Function.h"
22 #include "llvm/IR/LLVMContext.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetMachine.h"
26 
27 #define DEBUG_TYPE "si-shrink-instructions"
28 
29 STATISTIC(NumInstructionsShrunk,
30           "Number of 64-bit instruction reduced to 32-bit.");
31 STATISTIC(NumLiteralConstantsFolded,
32           "Number of literal constants folded into 32-bit instructions.");
33 
34 using namespace llvm;
35 
36 namespace {
37 
38 class SIShrinkInstructions : public MachineFunctionPass {
39 public:
40   static char ID;
41 
42 public:
43   SIShrinkInstructions() : MachineFunctionPass(ID) {
44   }
45 
46   bool runOnMachineFunction(MachineFunction &MF) override;
47 
48   StringRef getPassName() const override { return "SI Shrink Instructions"; }
49 
50   void getAnalysisUsage(AnalysisUsage &AU) const override {
51     AU.setPreservesCFG();
52     MachineFunctionPass::getAnalysisUsage(AU);
53   }
54 };
55 
56 } // End anonymous namespace.
57 
// Register the pass under the DEBUG_TYPE name ("si-shrink-instructions") with
// the description "SI Shrink Instructions".
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS - confirm against
// PassSupport.h.
INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

// Storage for the pass identifier declared in the class; it is passed to the
// MachineFunctionPass constructor and its value is never read in this file.
char SIShrinkInstructions::ID = 0;
62 
63 FunctionPass *llvm::createSIShrinkInstructionsPass() {
64   return new SIShrinkInstructions();
65 }
66 
67 static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
68                    const MachineRegisterInfo &MRI) {
69   if (!MO->isReg())
70     return false;
71 
72   if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
73     return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
74 
75   return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
76 }
77 
78 static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
79                       const SIRegisterInfo &TRI,
80                       const MachineRegisterInfo &MRI) {
81 
82   const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
83   // Can't shrink instruction with three operands.
84   // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
85   // a special case for it.  It can only be shrunk if the third operand
86   // is vcc.  We should handle this the same way we handle vopc, by addding
87   // a register allocation hint pre-regalloc and then do the shrining
88   // post-regalloc.
89   if (Src2) {
90     switch (MI.getOpcode()) {
91       default: return false;
92 
93       case AMDGPU::V_MAC_F32_e64:
94       case AMDGPU::V_MAC_F16_e64:
95         if (!isVGPR(Src2, TRI, MRI) ||
96             TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
97           return false;
98         break;
99 
100       case AMDGPU::V_CNDMASK_B32_e64:
101         break;
102     }
103   }
104 
105   const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
106   const MachineOperand *Src1Mod =
107       TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
108 
109   if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
110     return false;
111 
112   // We don't need to check src0, all input types are legal, so just make sure
113   // src0 isn't using any modifiers.
114   if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
115     return false;
116 
117   // Check output modifiers
118   if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
119     return false;
120 
121   return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
122 }
123 
124 /// \brief This function checks \p MI for operands defined by a move immediate
125 /// instruction and then folds the literal constant into the instruction if it
126 /// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
127 /// and will only fold literal constants if we are still in SSA.
128 static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
129                            MachineRegisterInfo &MRI, bool TryToCommute = true) {
130 
131   if (!MRI.isSSA())
132     return;
133 
134   assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
135 
136   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
137   MachineOperand &Src0 = MI.getOperand(Src0Idx);
138 
139   // Only one literal constant is allowed per instruction, so if src0 is a
140   // literal constant then we can't do any folding.
141   if (Src0.isImm() &&
142       TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
143     return;
144 
145   // Try to fold Src0
146   if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
147     unsigned Reg = Src0.getReg();
148     MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
149     if (Def && Def->isMoveImmediate()) {
150       MachineOperand &MovSrc = Def->getOperand(1);
151       bool ConstantFolded = false;
152 
153       if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
154                              isUInt<32>(MovSrc.getImm()))) {
155         Src0.ChangeToImmediate(MovSrc.getImm());
156         ConstantFolded = true;
157       }
158       if (ConstantFolded) {
159         if (MRI.use_empty(Reg))
160           Def->eraseFromParent();
161         ++NumLiteralConstantsFolded;
162         return;
163       }
164     }
165   }
166 
167   // We have failed to fold src0, so commute the instruction and try again.
168   if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
169     foldImmediates(MI, TII, MRI, false);
170 
171 }
172 
173 // Copy MachineOperand with all flags except setting it as implicit.
174 static void copyFlagsToImplicitVCC(MachineInstr &MI,
175                                    const MachineOperand &Orig) {
176 
177   for (MachineOperand &Use : MI.implicit_operands()) {
178     if (Use.getReg() == AMDGPU::VCC) {
179       Use.setIsUndef(Orig.isUndef());
180       Use.setIsKill(Orig.isKill());
181       return;
182     }
183   }
184 }
185 
186 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
187   return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
188 }
189 
190 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
191   return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
192 }
193 
194 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
195                                  const MachineOperand &Src,
196                                  bool &IsUnsigned) {
197   if (isInt<16>(Src.getImm())) {
198     IsUnsigned = false;
199     return !TII->isInlineConstant(Src, 4);
200   }
201 
202   if (isUInt<16>(Src.getImm())) {
203     IsUnsigned = true;
204     return !TII->isInlineConstant(Src, 4);
205   }
206 
207   return false;
208 }
209 
210 /// \returns true if the constant in \p Src should be replaced with a bitreverse
211 /// of an inline immediate.
212 static bool isReverseInlineImm(const SIInstrInfo *TII,
213                                const MachineOperand &Src,
214                                int32_t &ReverseImm) {
215   if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src, 4))
216     return false;
217 
218   ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
219   return ReverseImm >= -16 && ReverseImm <= 64;
220 }
221 
222 /// Copy implicit register operands from specified instruction to this
223 /// instruction that are not part of the instruction definition.
224 static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
225                                  const MachineInstr &MI) {
226   for (unsigned i = MI.getDesc().getNumOperands() +
227          MI.getDesc().getNumImplicitUses() +
228          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
229        i != e; ++i) {
230     const MachineOperand &MO = MI.getOperand(i);
231     if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
232       NewMI.addOperand(MF, MO);
233   }
234 }
235 
236 static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
237   // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
238   // get constants on the RHS.
239   if (!MI.getOperand(0).isReg())
240     TII->commuteInstruction(MI, false, 0, 1);
241 
242   const MachineOperand &Src1 = MI.getOperand(1);
243   if (!Src1.isImm())
244     return;
245 
246   int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
247   if (SOPKOpc == -1)
248     return;
249 
250   // eq/ne is special because the imm16 can be treated as signed or unsigned,
251   // and initially selectd to the unsigned versions.
252   if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
253     bool HasUImm;
254     if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
255       if (!HasUImm) {
256         SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
257           AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
258       }
259 
260       MI.setDesc(TII->get(SOPKOpc));
261     }
262 
263     return;
264   }
265 
266   const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
267 
268   if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
269       (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
270     MI.setDesc(NewDesc);
271   }
272 }
273 
274 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
275   if (skipFunction(*MF.getFunction()))
276     return false;
277 
278   MachineRegisterInfo &MRI = MF.getRegInfo();
279   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
280   const SIInstrInfo *TII = ST.getInstrInfo();
281   const SIRegisterInfo &TRI = TII->getRegisterInfo();
282 
283   std::vector<unsigned> I1Defs;
284 
285   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
286                                                   BI != BE; ++BI) {
287 
288     MachineBasicBlock &MBB = *BI;
289     MachineBasicBlock::iterator I, Next;
290     for (I = MBB.begin(); I != MBB.end(); I = Next) {
291       Next = std::next(I);
292       MachineInstr &MI = *I;
293 
294       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
295         // If this has a literal constant source that is the same as the
296         // reversed bits of an inline immediate, replace with a bitreverse of
297         // that constant. This saves 4 bytes in the common case of materializing
298         // sign bits.
299 
300         // Test if we are after regalloc. We only want to do this after any
301         // optimizations happen because this will confuse them.
302         // XXX - not exactly a check for post-regalloc run.
303         MachineOperand &Src = MI.getOperand(1);
304         if (Src.isImm() &&
305             TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
306           int32_t ReverseImm;
307           if (isReverseInlineImm(TII, Src, ReverseImm)) {
308             MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
309             Src.setImm(ReverseImm);
310             continue;
311           }
312         }
313       }
314 
315       // Combine adjacent s_nops to use the immediate operand encoding how long
316       // to wait.
317       //
318       // s_nop N
319       // s_nop M
320       //  =>
321       // s_nop (N + M)
322       if (MI.getOpcode() == AMDGPU::S_NOP &&
323           Next != MBB.end() &&
324           (*Next).getOpcode() == AMDGPU::S_NOP) {
325 
326         MachineInstr &NextMI = *Next;
327         // The instruction encodes the amount to wait with an offset of 1,
328         // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
329         // after adding.
330         uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
331         uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
332 
333         // Make sure we don't overflow the bounds.
334         if (Nop0 + Nop1 <= 8) {
335           NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
336           MI.eraseFromParent();
337         }
338 
339         continue;
340       }
341 
342       // FIXME: We also need to consider movs of constant operands since
343       // immediate operands are not folded if they have more than one use, and
344       // the operand folding pass is unaware if the immediate will be free since
345       // it won't know if the src == dest constraint will end up being
346       // satisfied.
347       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
348           MI.getOpcode() == AMDGPU::S_MUL_I32) {
349         const MachineOperand *Dest = &MI.getOperand(0);
350         MachineOperand *Src0 = &MI.getOperand(1);
351         MachineOperand *Src1 = &MI.getOperand(2);
352 
353         if (!Src0->isReg() && Src1->isReg()) {
354           if (TII->commuteInstruction(MI, false, 1, 2))
355             std::swap(Src0, Src1);
356         }
357 
358         // FIXME: This could work better if hints worked with subregisters. If
359         // we have a vector add of a constant, we usually don't get the correct
360         // allocation due to the subregister usage.
361         if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
362             Src0->isReg()) {
363           MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
364           MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
365           continue;
366         }
367 
368         if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
369           if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
370             unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
371               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
372 
373             MI.setDesc(TII->get(Opc));
374             MI.tieOperands(0, 1);
375           }
376         }
377       }
378 
379       // Try to use s_cmpk_*
380       if (MI.isCompare() && TII->isSOPC(MI)) {
381         shrinkScalarCompare(TII, MI);
382         continue;
383       }
384 
385       // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
386       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
387         const MachineOperand &Dst = MI.getOperand(0);
388         MachineOperand &Src = MI.getOperand(1);
389 
390         if (Src.isImm() &&
391             TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
392           int32_t ReverseImm;
393           if (isKImmOperand(TII, Src))
394             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
395           else if (isReverseInlineImm(TII, Src, ReverseImm)) {
396             MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
397             Src.setImm(ReverseImm);
398           }
399         }
400 
401         continue;
402       }
403 
404       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
405         continue;
406 
407       if (!canShrink(MI, TII, TRI, MRI)) {
408         // Try commuting the instruction and see if that enables us to shrink
409         // it.
410         if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
411             !canShrink(MI, TII, TRI, MRI))
412           continue;
413       }
414 
415       // getVOPe32 could be -1 here if we started with an instruction that had
416       // a 32-bit encoding and then commuted it to an instruction that did not.
417       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
418         continue;
419 
420       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
421 
422       if (TII->isVOPC(Op32)) {
423         unsigned DstReg = MI.getOperand(0).getReg();
424         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
425           // VOPC instructions can only write to the VCC register. We can't
426           // force them to use VCC here, because this is only one register and
427           // cannot deal with sequences which would require multiple copies of
428           // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
429           //
430           // So, instead of forcing the instruction to write to VCC, we provide
431           // a hint to the register allocator to use VCC and then we we will run
432           // this pass again after RA and shrink it if it outputs to VCC.
433           MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
434           continue;
435         }
436         if (DstReg != AMDGPU::VCC)
437           continue;
438       }
439 
440       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
441         // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
442         // instructions.
443         const MachineOperand *Src2 =
444             TII->getNamedOperand(MI, AMDGPU::OpName::src2);
445         if (!Src2->isReg())
446           continue;
447         unsigned SReg = Src2->getReg();
448         if (TargetRegisterInfo::isVirtualRegister(SReg)) {
449           MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
450           continue;
451         }
452         if (SReg != AMDGPU::VCC)
453           continue;
454       }
455 
456       // We can shrink this instruction
457       DEBUG(dbgs() << "Shrinking " << MI);
458 
459       MachineInstrBuilder Inst32 =
460           BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
461 
462       // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
463       // For VOPC instructions, this is replaced by an implicit def of vcc.
464       int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
465       if (Op32DstIdx != -1) {
466         // dst
467         Inst32.addOperand(MI.getOperand(0));
468       } else {
469         assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
470                "Unexpected case");
471       }
472 
473 
474       Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
475 
476       const MachineOperand *Src1 =
477           TII->getNamedOperand(MI, AMDGPU::OpName::src1);
478       if (Src1)
479         Inst32.addOperand(*Src1);
480 
481       const MachineOperand *Src2 =
482         TII->getNamedOperand(MI, AMDGPU::OpName::src2);
483       if (Src2) {
484         int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
485         if (Op32Src2Idx != -1) {
486           Inst32.addOperand(*Src2);
487         } else {
488           // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
489           // replaced with an implicit read of vcc. This was already added
490           // during the initial BuildMI, so find it to preserve the flags.
491           copyFlagsToImplicitVCC(*Inst32, *Src2);
492         }
493       }
494 
495       ++NumInstructionsShrunk;
496 
497       // Copy extra operands not present in the instruction definition.
498       copyExtraImplicitOps(*Inst32, MF, MI);
499 
500       MI.eraseFromParent();
501       foldImmediates(*Inst32, TII, MRI);
502 
503       DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
504 
505 
506     }
507   }
508   return false;
509 }
510