//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules :
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
// The mov_dpp instruction should reside in the same BB as all its uses
//===----------------------------------------------------------------------===//
39 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <limits>
56 
using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

// Pass-wide counter reported with -stats.
STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
62 
namespace {

// Folds V_MOV_B32_dpp instructions into their VALU uses, following the
// combining rules described in the file header comment. Runs on SSA
// machine IR.
class GCNDPPCombine : public MachineFunctionPass {
  // Both are set up per function in runOnMachineFunction.
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  // Tracks the definition of OldOpnd's register and returns the immediate
  // used to initialize it, nullptr if the register is undef (IMPLICIT_DEF),
  // or the operand itself otherwise.
  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  // Applies combining rule 2 (old == identity immediate -> old := src1) when
  // it is needed, then delegates to the overload below.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd,
                              bool CombBCZ) const;

  // Builds the DPP variant of OrigMI with the given old/bound_ctrl operands
  // and the dpp controls taken from MovMI; returns nullptr on failure.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              bool CombBCZ) const;

  // Returns true if MI has no immediate operand named OpndName, or that
  // operand's value masked with Mask equals Value.
  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  // Combines the V_MOV_B32_dpp MI with all of its uses, or leaves the
  // function untouched; returns true on success.
  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only instructions are rewritten; the CFG is never modified.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
109 
INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

// Opaque identifier used by the target to schedule this pass in the pipeline.
char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

// Factory used by the AMDGPU target machine.
FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
119 
120 static int getDPPOp(unsigned Op) {
121   auto DPP32 = AMDGPU::getDPPOp32(Op);
122   if (DPP32 != -1)
123     return DPP32;
124 
125   auto E32 = AMDGPU::getVOPe32(Op);
126   return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
127 }
128 
129 // tracks the register operand definition and returns:
130 //   1. immediate operand used to initialize the register if found
131 //   2. nullptr if the register operand is undef
132 //   3. the operand itself otherwise
133 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
134   auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
135   if (!Def)
136     return nullptr;
137 
138   switch(Def->getOpcode()) {
139   default: break;
140   case AMDGPU::IMPLICIT_DEF:
141     return nullptr;
142   case AMDGPU::COPY:
143   case AMDGPU::V_MOV_B32_e32: {
144     auto &Op1 = Def->getOperand(1);
145     if (Op1.isImm())
146       return &Op1;
147     break;
148   }
149   }
150   return &OldOpnd;
151 }
152 
// Emits the DPP variant of OrigMI right before OrigMI. Operands are wired
// from three sources: vdst/src modifiers/src1[/src2] come from OrigMI, the
// dpp controls (dpp_ctrl/row_mask/bank_mask) come from MovMI, and the
// old/bound_ctrl operands come from CombOldVGPR/CombBCZ as decided by the
// caller. Returns the new instruction, or nullptr (erasing the partially
// built one) if no DPP opcode exists or an operand is illegal for it.
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  // The mov's result must feed OrigMI's src0 — callers commute OrigMI
  // beforehand if the value arrived via src1.
  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  // Operands are appended in encoding order; the do/while(false) lets each
  // failure break out to the common cleanup below.
  do {
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    int NumOperands = 1;

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
      ++NumOperands;
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      // Only abs/neg can survive; combineDPPMov filtered everything else.
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    }
    // src0 of the combined instruction is the mov's source, i.e. the value
    // to be read from the other lane.
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    // The same source register is added to the DPP instruction built for
    // each use of the mov, so it must not carry a kill flag here.
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    }
    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    // The dpp controls are copied from the mov; bound_ctrl is the value the
    // caller chose per the combining rules.
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
250 
// Returns true if the immediate OldOpnd is the left identity element of the
// binary operation OrigMIOp, i.e. OrigMIOp(OldOpnd, x) == x for every x.
// Only then may the old operand be replaced by src1 (combining rule 2 in
// the file header): lanes that read old effectively compute op(old, src1).
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default: break;
  // 0 is neutral for add/or/xor, for subrev (src1 - src0), and for
  // unsigned max.
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_I32_e32:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_I32_e32:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_XOR_B32_e32:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  // All-ones is neutral for and and for unsigned min.
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_MIN_U32_e32:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  // INT32_MAX is neutral for signed min.
  case AMDGPU::V_MIN_I32_e32:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  // INT32_MIN is neutral for signed max.
  case AMDGPU::V_MAX_I32_e32:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  // 1 is neutral for multiplication.
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_U32_U24_e32:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}
289 
290 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
291                                            MachineInstr &MovMI,
292                                            RegSubRegPair CombOldVGPR,
293                                            MachineOperand *OldOpndValue,
294                                            bool CombBCZ) const {
295   assert(CombOldVGPR.Reg);
296   if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
297     auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
298     if (!Src1 || !Src1->isReg()) {
299       LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
300       return nullptr;
301     }
302     if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
303       LLVM_DEBUG(dbgs() << "  failed: old immediate ins't an identity\n");
304       return nullptr;
305     }
306     CombOldVGPR = getRegSubRegPair(*Src1);
307     if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
308       LLVM_DEBUG(dbgs() << "  failed: src1 isn't a VGPR32 register\n");
309       return nullptr;
310     }
311   }
312   return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
313 }
314 
315 // returns true if MI doesn't have OpndName immediate operand or the
316 // operand has Value
317 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
318                                     int64_t Value, int64_t Mask) const {
319   auto *Imm = TII->getNamedOperand(MI, OpndName);
320   if (!Imm)
321     return true;
322 
323   assert(Imm->isImm());
324   return (Imm->getImm() & Mask) == Value;
325 }
326 
// Attempts to fold the V_MOV_B32_dpp MovMI into every VALU use of its
// result, following the combining rules in the file header. Either all
// uses are combined (the mov and the original uses are erased, true is
// returned) or nothing changes (everything created here is erased, false
// is returned).
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  // The combined instruction executes the DPP read at its own position; if
  // EXEC may differ between the mov and a use, folding would change which
  // lanes participate.
  if (!isEXECMaskConstantBetweenDefAndUses(DPPMovReg, *MRI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  // 0xF/0xF means the masks disable no lanes.
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  assert(OldOpnd && OldOpnd->isReg());

  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse IMPLICIT_DEF instruction later
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  // Decide which combining rule (see the file header) applies.
  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
      LLVM_DEBUG(dbgs() <<
        "  failed: old reg def and mov should be in the same BB\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        // old==0 with all lanes enabled is equivalent to bound_ctrl:0.
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  // OrigMIs collects what to erase on success, DPPMIs what to erase on
  // rollback.
  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    CombOldVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  // Try to build a DPP replacement for every use; a single failure aborts
  // the whole transformation.
  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Rollback = true;

    auto &OrigMI = *Use.getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    if (TII->isVOP3(OrigOp)) {
      // A VOP3 use is only foldable if it can be shrunk to e32 and carries
      // no modifiers the DPP encoding cannot express.
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      // The DPP value feeds src1; try it as src0 on a commuted clone.
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                          OldOpndValue, CombBCZ)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      // The clone was only needed as input for createDPPInst.
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  // On success erase the mov plus original uses; on rollback erase
  // everything created above instead.
  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  return !Rollback;
}
470 
471 bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
472   auto &ST = MF.getSubtarget<GCNSubtarget>();
473   if (!ST.hasDPP() || skipFunction(MF.getFunction()))
474     return false;
475 
476   MRI = &MF.getRegInfo();
477   TII = ST.getInstrInfo();
478 
479   assert(MRI->isSSA() && "Must be run on SSA");
480 
481   bool Changed = false;
482   for (auto &MBB : MF) {
483     for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
484       auto &MI = *I++;
485       if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
486         Changed = true;
487         ++NumDPPMovsCombined;
488       }
489     }
490   }
491   return Changed;
492 }
493