1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 /// The pass tries to use the 32-bit encoding for instructions when possible.
9 //===----------------------------------------------------------------------===//
10 //
11 
12 #include "AMDGPU.h"
13 #include "AMDGPUSubtarget.h"
14 #include "SIInstrInfo.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #include "llvm/CodeGen/MachineInstrBuilder.h"
19 #include "llvm/CodeGen/MachineRegisterInfo.h"
20 #include "llvm/IR/Constants.h"
21 #include "llvm/IR/Function.h"
22 #include "llvm/IR/LLVMContext.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetMachine.h"
26 
27 #define DEBUG_TYPE "si-shrink-instructions"
28 
29 STATISTIC(NumInstructionsShrunk,
30           "Number of 64-bit instruction reduced to 32-bit.");
31 STATISTIC(NumLiteralConstantsFolded,
32           "Number of literal constants folded into 32-bit instructions.");
33 
34 using namespace llvm;
35 
36 namespace {
37 
38 class SIShrinkInstructions : public MachineFunctionPass {
39 public:
40   static char ID;
41 
42 public:
43   SIShrinkInstructions() : MachineFunctionPass(ID) {
44   }
45 
46   bool runOnMachineFunction(MachineFunction &MF) override;
47 
48   StringRef getPassName() const override { return "SI Shrink Instructions"; }
49 
50   void getAnalysisUsage(AnalysisUsage &AU) const override {
51     AU.setPreservesCFG();
52     MachineFunctionPass::getAnalysisUsage(AU);
53   }
54 };
55 
56 } // End anonymous namespace.
57 
58 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
59                 "SI Shrink Instructions", false, false)
60 
61 char SIShrinkInstructions::ID = 0;
62 
63 FunctionPass *llvm::createSIShrinkInstructionsPass() {
64   return new SIShrinkInstructions();
65 }
66 
67 static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
68                       const SIRegisterInfo &TRI,
69                       const MachineRegisterInfo &MRI) {
70 
71   const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
72   // Can't shrink instruction with three operands.
73   // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
74   // a special case for it.  It can only be shrunk if the third operand
75   // is vcc.  We should handle this the same way we handle vopc, by addding
76   // a register allocation hint pre-regalloc and then do the shrinking
77   // post-regalloc.
78   if (Src2) {
79     switch (MI.getOpcode()) {
80       default: return false;
81 
82       case AMDGPU::V_ADDC_U32_e64:
83       case AMDGPU::V_SUBB_U32_e64:
84       case AMDGPU::V_SUBBREV_U32_e64: {
85         const MachineOperand *Src1
86           = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
87         if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()))
88           return false;
89         // Additional verification is needed for sdst/src2.
90         return true;
91       }
92       case AMDGPU::V_MAC_F32_e64:
93       case AMDGPU::V_MAC_F16_e64:
94       case AMDGPU::V_FMAC_F32_e64:
95         if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) ||
96             TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
97           return false;
98         break;
99 
100       case AMDGPU::V_CNDMASK_B32_e64:
101         break;
102     }
103   }
104 
105   const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
106   if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) ||
107                TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
108     return false;
109 
110   // We don't need to check src0, all input types are legal, so just make sure
111   // src0 isn't using any modifiers.
112   if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
113     return false;
114 
115   // Check output modifiers
116   return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
117          !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
118 }
119 
120 /// This function checks \p MI for operands defined by a move immediate
121 /// instruction and then folds the literal constant into the instruction if it
122 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
123 static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
124                            MachineRegisterInfo &MRI, bool TryToCommute = true) {
125   assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
126 
127   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
128 
129   // Try to fold Src0
130   MachineOperand &Src0 = MI.getOperand(Src0Idx);
131   if (Src0.isReg()) {
132     unsigned Reg = Src0.getReg();
133     if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
134       MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
135       if (Def && Def->isMoveImmediate()) {
136         MachineOperand &MovSrc = Def->getOperand(1);
137         bool ConstantFolded = false;
138 
139         if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
140                                isUInt<32>(MovSrc.getImm()))) {
141           // It's possible to have only one component of a super-reg defined by
142           // a single mov, so we need to clear any subregister flag.
143           Src0.setSubReg(0);
144           Src0.ChangeToImmediate(MovSrc.getImm());
145           ConstantFolded = true;
146         } else if (MovSrc.isFI()) {
147           Src0.setSubReg(0);
148           Src0.ChangeToFrameIndex(MovSrc.getIndex());
149           ConstantFolded = true;
150         }
151 
152         if (ConstantFolded) {
153           assert(MRI.use_empty(Reg));
154           Def->eraseFromParent();
155           ++NumLiteralConstantsFolded;
156           return true;
157         }
158       }
159     }
160   }
161 
162   // We have failed to fold src0, so commute the instruction and try again.
163   if (TryToCommute && MI.isCommutable()) {
164     if (TII->commuteInstruction(MI)) {
165       if (foldImmediates(MI, TII, MRI, false))
166         return true;
167 
168       // Commute back.
169       TII->commuteInstruction(MI);
170     }
171   }
172 
173   return false;
174 }
175 
176 // Copy MachineOperand with all flags except setting it as implicit.
177 static void copyFlagsToImplicitVCC(MachineInstr &MI,
178                                    const MachineOperand &Orig) {
179 
180   for (MachineOperand &Use : MI.implicit_operands()) {
181     if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
182       Use.setIsUndef(Orig.isUndef());
183       Use.setIsKill(Orig.isKill());
184       return;
185     }
186   }
187 }
188 
189 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
190   return isInt<16>(Src.getImm()) &&
191     !TII->isInlineConstant(*Src.getParent(),
192                            Src.getParent()->getOperandNo(&Src));
193 }
194 
195 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
196   return isUInt<16>(Src.getImm()) &&
197     !TII->isInlineConstant(*Src.getParent(),
198                            Src.getParent()->getOperandNo(&Src));
199 }
200 
201 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
202                                  const MachineOperand &Src,
203                                  bool &IsUnsigned) {
204   if (isInt<16>(Src.getImm())) {
205     IsUnsigned = false;
206     return !TII->isInlineConstant(Src);
207   }
208 
209   if (isUInt<16>(Src.getImm())) {
210     IsUnsigned = true;
211     return !TII->isInlineConstant(Src);
212   }
213 
214   return false;
215 }
216 
217 /// \returns true if the constant in \p Src should be replaced with a bitreverse
218 /// of an inline immediate.
219 static bool isReverseInlineImm(const SIInstrInfo *TII,
220                                const MachineOperand &Src,
221                                int32_t &ReverseImm) {
222   if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
223     return false;
224 
225   ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
226   return ReverseImm >= -16 && ReverseImm <= 64;
227 }
228 
229 /// Copy implicit register operands from specified instruction to this
230 /// instruction that are not part of the instruction definition.
231 static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
232                                  const MachineInstr &MI) {
233   for (unsigned i = MI.getDesc().getNumOperands() +
234          MI.getDesc().getNumImplicitUses() +
235          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
236        i != e; ++i) {
237     const MachineOperand &MO = MI.getOperand(i);
238     if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
239       NewMI.addOperand(MF, MO);
240   }
241 }
242 
243 static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
244   // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
245   // get constants on the RHS.
246   if (!MI.getOperand(0).isReg())
247     TII->commuteInstruction(MI, false, 0, 1);
248 
249   const MachineOperand &Src1 = MI.getOperand(1);
250   if (!Src1.isImm())
251     return;
252 
253   int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
254   if (SOPKOpc == -1)
255     return;
256 
257   // eq/ne is special because the imm16 can be treated as signed or unsigned,
258   // and initially selectd to the unsigned versions.
259   if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
260     bool HasUImm;
261     if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
262       if (!HasUImm) {
263         SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
264           AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
265       }
266 
267       MI.setDesc(TII->get(SOPKOpc));
268     }
269 
270     return;
271   }
272 
273   const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
274 
275   if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
276       (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
277     MI.setDesc(NewDesc);
278   }
279 }
280 
281 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
282   if (skipFunction(MF.getFunction()))
283     return false;
284 
285   MachineRegisterInfo &MRI = MF.getRegInfo();
286   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
287   const SIInstrInfo *TII = ST.getInstrInfo();
288   const SIRegisterInfo &TRI = TII->getRegisterInfo();
289 
290   std::vector<unsigned> I1Defs;
291 
292   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
293                                                   BI != BE; ++BI) {
294 
295     MachineBasicBlock &MBB = *BI;
296     MachineBasicBlock::iterator I, Next;
297     for (I = MBB.begin(); I != MBB.end(); I = Next) {
298       Next = std::next(I);
299       MachineInstr &MI = *I;
300 
301       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
302         // If this has a literal constant source that is the same as the
303         // reversed bits of an inline immediate, replace with a bitreverse of
304         // that constant. This saves 4 bytes in the common case of materializing
305         // sign bits.
306 
307         // Test if we are after regalloc. We only want to do this after any
308         // optimizations happen because this will confuse them.
309         // XXX - not exactly a check for post-regalloc run.
310         MachineOperand &Src = MI.getOperand(1);
311         if (Src.isImm() &&
312             TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
313           int32_t ReverseImm;
314           if (isReverseInlineImm(TII, Src, ReverseImm)) {
315             MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
316             Src.setImm(ReverseImm);
317             continue;
318           }
319         }
320       }
321 
322       // Combine adjacent s_nops to use the immediate operand encoding how long
323       // to wait.
324       //
325       // s_nop N
326       // s_nop M
327       //  =>
328       // s_nop (N + M)
329       if (MI.getOpcode() == AMDGPU::S_NOP &&
330           Next != MBB.end() &&
331           (*Next).getOpcode() == AMDGPU::S_NOP) {
332 
333         MachineInstr &NextMI = *Next;
334         // The instruction encodes the amount to wait with an offset of 1,
335         // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
336         // after adding.
337         uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
338         uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
339 
340         // Make sure we don't overflow the bounds.
341         if (Nop0 + Nop1 <= 8) {
342           NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
343           MI.eraseFromParent();
344         }
345 
346         continue;
347       }
348 
349       // FIXME: We also need to consider movs of constant operands since
350       // immediate operands are not folded if they have more than one use, and
351       // the operand folding pass is unaware if the immediate will be free since
352       // it won't know if the src == dest constraint will end up being
353       // satisfied.
354       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
355           MI.getOpcode() == AMDGPU::S_MUL_I32) {
356         const MachineOperand *Dest = &MI.getOperand(0);
357         MachineOperand *Src0 = &MI.getOperand(1);
358         MachineOperand *Src1 = &MI.getOperand(2);
359 
360         if (!Src0->isReg() && Src1->isReg()) {
361           if (TII->commuteInstruction(MI, false, 1, 2))
362             std::swap(Src0, Src1);
363         }
364 
365         // FIXME: This could work better if hints worked with subregisters. If
366         // we have a vector add of a constant, we usually don't get the correct
367         // allocation due to the subregister usage.
368         if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
369             Src0->isReg()) {
370           MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
371           MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
372           continue;
373         }
374 
375         if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
376           if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
377             unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
378               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
379 
380             MI.setDesc(TII->get(Opc));
381             MI.tieOperands(0, 1);
382           }
383         }
384       }
385 
386       // Try to use s_cmpk_*
387       if (MI.isCompare() && TII->isSOPC(MI)) {
388         shrinkScalarCompare(TII, MI);
389         continue;
390       }
391 
392       // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
393       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
394         const MachineOperand &Dst = MI.getOperand(0);
395         MachineOperand &Src = MI.getOperand(1);
396 
397         if (Src.isImm() &&
398             TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
399           int32_t ReverseImm;
400           if (isKImmOperand(TII, Src))
401             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
402           else if (isReverseInlineImm(TII, Src, ReverseImm)) {
403             MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
404             Src.setImm(ReverseImm);
405           }
406         }
407 
408         continue;
409       }
410 
411       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
412         continue;
413 
414       if (!canShrink(MI, TII, TRI, MRI)) {
415         // Try commuting the instruction and see if that enables us to shrink
416         // it.
417         if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
418             !canShrink(MI, TII, TRI, MRI))
419           continue;
420       }
421 
422       // getVOPe32 could be -1 here if we started with an instruction that had
423       // a 32-bit encoding and then commuted it to an instruction that did not.
424       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
425         continue;
426 
427       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
428 
429       if (TII->isVOPC(Op32)) {
430         unsigned DstReg = MI.getOperand(0).getReg();
431         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
432           // VOPC instructions can only write to the VCC register. We can't
433           // force them to use VCC here, because this is only one register and
434           // cannot deal with sequences which would require multiple copies of
435           // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
436           //
437           // So, instead of forcing the instruction to write to VCC, we provide
438           // a hint to the register allocator to use VCC and then we will run
439           // this pass again after RA and shrink it if it outputs to VCC.
440           MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
441           continue;
442         }
443         if (DstReg != AMDGPU::VCC)
444           continue;
445       }
446 
447       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
448         // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
449         // instructions.
450         const MachineOperand *Src2 =
451             TII->getNamedOperand(MI, AMDGPU::OpName::src2);
452         if (!Src2->isReg())
453           continue;
454         unsigned SReg = Src2->getReg();
455         if (TargetRegisterInfo::isVirtualRegister(SReg)) {
456           MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
457           continue;
458         }
459         if (SReg != AMDGPU::VCC)
460           continue;
461       }
462 
463       // Check for the bool flag output for instructions like V_ADD_I32_e64.
464       const MachineOperand *SDst = TII->getNamedOperand(MI,
465                                                         AMDGPU::OpName::sdst);
466 
467       // Check the carry-in operand for v_addc_u32_e64.
468       const MachineOperand *Src2 = TII->getNamedOperand(MI,
469                                                         AMDGPU::OpName::src2);
470 
471       if (SDst) {
472         if (SDst->getReg() != AMDGPU::VCC) {
473           if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
474             MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
475           continue;
476         }
477 
478         // All of the instructions with carry outs also have an SGPR input in
479         // src2.
480         if (Src2 && Src2->getReg() != AMDGPU::VCC) {
481           if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
482             MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
483 
484           continue;
485         }
486       }
487 
488       // We can shrink this instruction
489       LLVM_DEBUG(dbgs() << "Shrinking " << MI);
490 
491       MachineInstrBuilder Inst32 =
492           BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
493 
494       // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
495       // For VOPC instructions, this is replaced by an implicit def of vcc.
496       int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
497       if (Op32DstIdx != -1) {
498         // dst
499         Inst32.add(MI.getOperand(0));
500       } else {
501         assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
502                "Unexpected case");
503       }
504 
505 
506       Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
507 
508       const MachineOperand *Src1 =
509           TII->getNamedOperand(MI, AMDGPU::OpName::src1);
510       if (Src1)
511         Inst32.add(*Src1);
512 
513       if (Src2) {
514         int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
515         if (Op32Src2Idx != -1) {
516           Inst32.add(*Src2);
517         } else {
518           // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
519           // replaced with an implicit read of vcc. This was already added
520           // during the initial BuildMI, so find it to preserve the flags.
521           copyFlagsToImplicitVCC(*Inst32, *Src2);
522         }
523       }
524 
525       ++NumInstructionsShrunk;
526 
527       // Copy extra operands not present in the instruction definition.
528       copyExtraImplicitOps(*Inst32, MF, MI);
529 
530       MI.eraseFromParent();
531       foldImmediates(*Inst32, TII, MRI);
532 
533       LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
534     }
535   }
536   return false;
537 }
538