10002d4bfSBing1 Yu //===-- X86PreTileConfig.cpp - Tile Register Pre-configure-----------------===//
2f80b2987SLuo, Yuanke //
3f80b2987SLuo, Yuanke // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4f80b2987SLuo, Yuanke // See https://llvm.org/LICENSE.txt for license information.
5f80b2987SLuo, Yuanke // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6f80b2987SLuo, Yuanke //
7f80b2987SLuo, Yuanke //===----------------------------------------------------------------------===//
8f80b2987SLuo, Yuanke //
/// \file Pass to pre-configure the shapes of AMX registers.
/// An AMX register needs to be configured before use. The shapes of an AMX
/// register are encoded in the 1st and 2nd machine operands of the AMX pseudo
/// instructions.
///
/// The instruction ldtilecfg is used to configure the shapes. It must be
/// reachable for all variable shapes. ldtilecfg will be inserted more than
/// once if we cannot find a dominating point for all AMX instructions.
///
/// The configure register is caller saved according to the ABI. We need to
/// insert ldtilecfg again after a call instruction if the callee clobbers any
/// AMX registers.
///
/// This pass calculates all the points where ldtilecfg needs to be inserted
/// and inserts it there. It reports an error if the reachability conditions
/// aren't met.
23f80b2987SLuo, Yuanke //
24f80b2987SLuo, Yuanke //===----------------------------------------------------------------------===//
25f80b2987SLuo, Yuanke 
26f80b2987SLuo, Yuanke #include "X86.h"
27f80b2987SLuo, Yuanke #include "X86InstrBuilder.h"
28c4dba471SLuo, Yuanke #include "X86MachineFunctionInfo.h"
29f80b2987SLuo, Yuanke #include "X86RegisterInfo.h"
30f80b2987SLuo, Yuanke #include "X86Subtarget.h"
31f80b2987SLuo, Yuanke #include "llvm/CodeGen/MachineFunctionPass.h"
32f80b2987SLuo, Yuanke #include "llvm/CodeGen/MachineInstr.h"
334cbaaf4aSWang, Pengfei #include "llvm/CodeGen/MachineLoopInfo.h"
34f3ad7ea0SLuo, Yuanke #include "llvm/CodeGen/MachineModuleInfo.h"
35f80b2987SLuo, Yuanke #include "llvm/CodeGen/MachineRegisterInfo.h"
36f80b2987SLuo, Yuanke #include "llvm/CodeGen/Passes.h"
37f80b2987SLuo, Yuanke #include "llvm/CodeGen/TargetInstrInfo.h"
38f80b2987SLuo, Yuanke #include "llvm/CodeGen/TargetRegisterInfo.h"
39f80b2987SLuo, Yuanke #include "llvm/InitializePasses.h"
40f80b2987SLuo, Yuanke 
41f80b2987SLuo, Yuanke using namespace llvm;
42f80b2987SLuo, Yuanke 
43f80b2987SLuo, Yuanke #define DEBUG_TYPE "tile-pre-config"
44f3ad7ea0SLuo, Yuanke 
emitErrorMsg(MachineFunction & MF)45f3ad7ea0SLuo, Yuanke static void emitErrorMsg(MachineFunction &MF) {
46f3ad7ea0SLuo, Yuanke   SmallString<32> Str;
47f3ad7ea0SLuo, Yuanke   Twine ErrorMsg =
48f3ad7ea0SLuo, Yuanke       MF.getName() +
49f3ad7ea0SLuo, Yuanke       ": Failed to config tile register, please define the shape earlier";
50f3ad7ea0SLuo, Yuanke   LLVMContext &Context = MF.getMMI().getModule()->getContext();
51f3ad7ea0SLuo, Yuanke   Context.emitError(ErrorMsg);
52f3ad7ea0SLuo, Yuanke }
53f80b2987SLuo, Yuanke 
54f80b2987SLuo, Yuanke namespace {
55f80b2987SLuo, Yuanke 
564cbaaf4aSWang, Pengfei struct MIRef {
574cbaaf4aSWang, Pengfei   MachineInstr *MI = nullptr;
584cbaaf4aSWang, Pengfei   MachineBasicBlock *MBB = nullptr;
594cbaaf4aSWang, Pengfei   // A virtual position for instruction that will be inserted after MI.
604cbaaf4aSWang, Pengfei   size_t Pos = 0;
614cbaaf4aSWang, Pengfei   MIRef() = default;
MIRef__anon71fb239c0111::MIRef624cbaaf4aSWang, Pengfei   MIRef(MachineBasicBlock *MBB) : MBB(MBB) {
634cbaaf4aSWang, Pengfei     for (auto I = MBB->begin(), E = MBB->end(); I != E && I->isPHI();
644cbaaf4aSWang, Pengfei          ++I, ++Pos)
654cbaaf4aSWang, Pengfei       MI = &*I;
664cbaaf4aSWang, Pengfei   }
MIRef__anon71fb239c0111::MIRef67016092d7SWang, Pengfei   MIRef(MachineInstr *MI)
68016092d7SWang, Pengfei       : MI(MI), MBB(MI->getParent()),
69016092d7SWang, Pengfei         Pos(std::distance(MBB->instr_begin(), ++MI->getIterator())) {}
MIRef__anon71fb239c0111::MIRef704cbaaf4aSWang, Pengfei   MIRef(MachineInstr *MI, MachineBasicBlock *MBB)
714cbaaf4aSWang, Pengfei       : MI(MI), MBB(MBB),
724cbaaf4aSWang, Pengfei         Pos(std::distance(MBB->instr_begin(), ++MI->getIterator())) {}
MIRef__anon71fb239c0111::MIRef734cbaaf4aSWang, Pengfei   MIRef(MachineInstr *MI, MachineBasicBlock *MBB, size_t Pos)
744cbaaf4aSWang, Pengfei       : MI(MI), MBB(MBB), Pos(Pos) {}
operator bool__anon71fb239c0111::MIRef754cbaaf4aSWang, Pengfei   operator bool() const { return MBB != nullptr; }
operator ==__anon71fb239c0111::MIRef764cbaaf4aSWang, Pengfei   bool operator==(const MIRef &RHS) const {
774cbaaf4aSWang, Pengfei     return MI == RHS.MI && MBB == RHS.MBB;
784cbaaf4aSWang, Pengfei   }
operator !=__anon71fb239c0111::MIRef79016092d7SWang, Pengfei   bool operator!=(const MIRef &RHS) const { return !(*this == RHS); }
operator <__anon71fb239c0111::MIRef804cbaaf4aSWang, Pengfei   bool operator<(const MIRef &RHS) const {
81f69adfb8SWang, Pengfei     // Comparison between different BBs happens when inserting a MIRef into set.
82f69adfb8SWang, Pengfei     // So we compare MBB first to make the insertion happy.
83151e244fSWang, Pengfei     return MBB < RHS.MBB || (MBB == RHS.MBB && Pos < RHS.Pos);
844cbaaf4aSWang, Pengfei   }
operator >__anon71fb239c0111::MIRef854cbaaf4aSWang, Pengfei   bool operator>(const MIRef &RHS) const {
86f69adfb8SWang, Pengfei     // Comparison between different BBs happens when inserting a MIRef into set.
87f69adfb8SWang, Pengfei     // So we compare MBB first to make the insertion happy.
88151e244fSWang, Pengfei     return MBB > RHS.MBB || (MBB == RHS.MBB && Pos > RHS.Pos);
894cbaaf4aSWang, Pengfei   }
904cbaaf4aSWang, Pengfei };
91f80b2987SLuo, Yuanke 
924cbaaf4aSWang, Pengfei struct BBInfo {
934cbaaf4aSWang, Pengfei   MIRef FirstAMX;
944cbaaf4aSWang, Pengfei   MIRef LastCall;
95016092d7SWang, Pengfei   bool HasAMXRegLiveIn = false;
96a3b52a9dSWang, Pengfei   bool TileCfgForbidden = false;
974cbaaf4aSWang, Pengfei   bool NeedTileCfgLiveIn = false;
984cbaaf4aSWang, Pengfei };
994cbaaf4aSWang, Pengfei 
1004cbaaf4aSWang, Pengfei class X86PreTileConfig : public MachineFunctionPass {
1014cbaaf4aSWang, Pengfei   MachineRegisterInfo *MRI;
1024cbaaf4aSWang, Pengfei   const MachineLoopInfo *MLI;
1034cbaaf4aSWang, Pengfei   SmallSet<MachineInstr *, 8> DefVisited;
1044cbaaf4aSWang, Pengfei   DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
105016092d7SWang, Pengfei   DenseMap<MachineBasicBlock *, SmallVector<MIRef, 8>> ShapeBBs;
1064cbaaf4aSWang, Pengfei 
1074cbaaf4aSWang, Pengfei   /// Check if the callee will clobber AMX registers.
isDestructiveCall(MachineInstr & MI,BitVector UsableRegs)1084cbaaf4aSWang, Pengfei   bool isDestructiveCall(MachineInstr &MI, BitVector UsableRegs) {
1094cbaaf4aSWang, Pengfei     auto Iter = llvm::find_if(
1104cbaaf4aSWang, Pengfei         MI.operands(), [](MachineOperand &MO) { return MO.isRegMask(); });
1114cbaaf4aSWang, Pengfei     if (Iter == MI.operands_end())
1124cbaaf4aSWang, Pengfei       return false;
1134cbaaf4aSWang, Pengfei     UsableRegs.clearBitsInMask(Iter->getRegMask());
1144cbaaf4aSWang, Pengfei     return !UsableRegs.none();
1154cbaaf4aSWang, Pengfei   }
1164cbaaf4aSWang, Pengfei 
1174cbaaf4aSWang, Pengfei   /// Check if MI is AMX pseudo instruction.
isAMXInstruction(MachineInstr & MI)1184cbaaf4aSWang, Pengfei   bool isAMXInstruction(MachineInstr &MI) {
1194cbaaf4aSWang, Pengfei     if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
1204cbaaf4aSWang, Pengfei       return false;
1214cbaaf4aSWang, Pengfei     MachineOperand &MO = MI.getOperand(0);
1224cbaaf4aSWang, Pengfei     // We can simply check if it is AMX instruction by its def.
1234cbaaf4aSWang, Pengfei     // But we should exclude old API which uses physical registers.
1244cbaaf4aSWang, Pengfei     if (MO.isReg() && MO.getReg().isVirtual() &&
1254cbaaf4aSWang, Pengfei         MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
1264cbaaf4aSWang, Pengfei       collectShapeInfo(MI);
1274cbaaf4aSWang, Pengfei       return true;
1284cbaaf4aSWang, Pengfei     }
1294cbaaf4aSWang, Pengfei     // PTILESTOREDV is the only exception that doesn't def a AMX register.
1304cbaaf4aSWang, Pengfei     return MI.getOpcode() == X86::PTILESTOREDV;
1314cbaaf4aSWang, Pengfei   }
1324cbaaf4aSWang, Pengfei 
1334cbaaf4aSWang, Pengfei   /// Check if it is an edge from loop bottom to loop head.
isLoopBackEdge(MachineBasicBlock * Header,MachineBasicBlock * Bottom)1344cbaaf4aSWang, Pengfei   bool isLoopBackEdge(MachineBasicBlock *Header, MachineBasicBlock *Bottom) {
1354ed2b6ccSLuo, Yuanke     if (!MLI->isLoopHeader(Header))
1364ed2b6ccSLuo, Yuanke       return false;
1374ed2b6ccSLuo, Yuanke     auto *ML = MLI->getLoopFor(Header);
1384ed2b6ccSLuo, Yuanke     if (ML->contains(Bottom) && ML->isLoopLatch(Bottom))
1394ed2b6ccSLuo, Yuanke       return true;
1404ed2b6ccSLuo, Yuanke 
1414ed2b6ccSLuo, Yuanke     return false;
1424cbaaf4aSWang, Pengfei   }
1434cbaaf4aSWang, Pengfei 
1444cbaaf4aSWang, Pengfei   /// Collect the shape def information for later use.
1454cbaaf4aSWang, Pengfei   void collectShapeInfo(MachineInstr &MI);
146f80b2987SLuo, Yuanke 
147016092d7SWang, Pengfei   /// Try to hoist shapes definded below AMX instructions.
hoistShapesInBB(MachineBasicBlock * MBB,SmallVectorImpl<MIRef> & Shapes)148016092d7SWang, Pengfei   bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
149016092d7SWang, Pengfei     MIRef &FirstAMX = BBVisitedInfo[MBB].FirstAMX;
150016092d7SWang, Pengfei     auto FirstShapeBelowAMX = llvm::lower_bound(Shapes, FirstAMX);
151016092d7SWang, Pengfei     auto InsertPoint = FirstAMX.MI->getIterator();
152016092d7SWang, Pengfei     for (auto I = FirstShapeBelowAMX, E = Shapes.end(); I != E; ++I) {
153016092d7SWang, Pengfei       // Do not hoist instructions that access memory.
154016092d7SWang, Pengfei       if (I->MI->mayLoadOrStore())
155016092d7SWang, Pengfei         return false;
156016092d7SWang, Pengfei       for (auto &MO : I->MI->operands()) {
157016092d7SWang, Pengfei         if (MO.isDef())
158016092d7SWang, Pengfei           continue;
159016092d7SWang, Pengfei         // Do not hoist instructions if the sources' def under AMX instruction.
160016092d7SWang, Pengfei         // TODO: We can handle isMoveImmediate MI here.
161016092d7SWang, Pengfei         if (MO.isReg() && MIRef(MRI->getVRegDef(MO.getReg())) > FirstAMX)
162016092d7SWang, Pengfei           return false;
163016092d7SWang, Pengfei         // TODO: Maybe need more checks here.
164016092d7SWang, Pengfei       }
165016092d7SWang, Pengfei       MBB->insert(InsertPoint, I->MI->removeFromParent());
166016092d7SWang, Pengfei     }
167016092d7SWang, Pengfei     // We only need to mark the last shape in the BB now.
168016092d7SWang, Pengfei     Shapes.clear();
169016092d7SWang, Pengfei     Shapes.push_back(MIRef(&*--InsertPoint, MBB));
170016092d7SWang, Pengfei     return true;
171016092d7SWang, Pengfei   }
172016092d7SWang, Pengfei 
173f80b2987SLuo, Yuanke public:
X86PreTileConfig()174f80b2987SLuo, Yuanke   X86PreTileConfig() : MachineFunctionPass(ID) {}
175f80b2987SLuo, Yuanke 
176f80b2987SLuo, Yuanke   /// Return the pass name.
getPassName() const177f80b2987SLuo, Yuanke   StringRef getPassName() const override {
178f80b2987SLuo, Yuanke     return "Tile Register Pre-configure";
179f80b2987SLuo, Yuanke   }
180f80b2987SLuo, Yuanke 
181f80b2987SLuo, Yuanke   /// X86PreTileConfig analysis usage.
getAnalysisUsage(AnalysisUsage & AU) const1824cbaaf4aSWang, Pengfei   void getAnalysisUsage(AnalysisUsage &AU) const override {
1834cbaaf4aSWang, Pengfei     AU.setPreservesAll();
1844cbaaf4aSWang, Pengfei     AU.addRequired<MachineLoopInfo>();
1854cbaaf4aSWang, Pengfei     MachineFunctionPass::getAnalysisUsage(AU);
1864cbaaf4aSWang, Pengfei   }
187f80b2987SLuo, Yuanke 
1884cbaaf4aSWang, Pengfei   /// Clear MF related structures.
releaseMemory()1894cbaaf4aSWang, Pengfei   void releaseMemory() override {
1904cbaaf4aSWang, Pengfei     ShapeBBs.clear();
1914cbaaf4aSWang, Pengfei     DefVisited.clear();
1924cbaaf4aSWang, Pengfei     BBVisitedInfo.clear();
1934cbaaf4aSWang, Pengfei   }
1944cbaaf4aSWang, Pengfei 
1954cbaaf4aSWang, Pengfei   /// Perform ldtilecfg instructions inserting.
1964cbaaf4aSWang, Pengfei   bool runOnMachineFunction(MachineFunction &MF) override;
197f80b2987SLuo, Yuanke 
198f80b2987SLuo, Yuanke   static char ID;
199f80b2987SLuo, Yuanke };
200f80b2987SLuo, Yuanke 
201f80b2987SLuo, Yuanke } // end anonymous namespace
202f80b2987SLuo, Yuanke 
203f80b2987SLuo, Yuanke char X86PreTileConfig::ID = 0;
204f80b2987SLuo, Yuanke 
205f80b2987SLuo, Yuanke INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
2060002d4bfSBing1 Yu                       "Tile Register Pre-configure", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)2074cbaaf4aSWang, Pengfei INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
208f80b2987SLuo, Yuanke INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
2090002d4bfSBing1 Yu                     "Tile Register Pre-configure", false, false)
210f80b2987SLuo, Yuanke 
2114cbaaf4aSWang, Pengfei void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
2124cbaaf4aSWang, Pengfei   auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
2134cbaaf4aSWang, Pengfei     MIRef MIR(MI, MBB);
214016092d7SWang, Pengfei     auto I = llvm::lower_bound(ShapeBBs[MBB], MIR);
215016092d7SWang, Pengfei     if (I == ShapeBBs[MBB].end() || *I != MIR)
216016092d7SWang, Pengfei       ShapeBBs[MBB].insert(I, MIR);
217a5d9e0c7SWang, Pengfei   };
218a5d9e0c7SWang, Pengfei 
2194cbaaf4aSWang, Pengfei   SmallVector<Register, 8> WorkList(
2204cbaaf4aSWang, Pengfei       {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
221a5d9e0c7SWang, Pengfei   while (!WorkList.empty()) {
2224cbaaf4aSWang, Pengfei     Register R = WorkList.pop_back_val();
2234cbaaf4aSWang, Pengfei     MachineInstr *DefMI = MRI->getVRegDef(R);
224f69adfb8SWang, Pengfei     assert(DefMI && "R must has one define instruction");
2254cbaaf4aSWang, Pengfei     MachineBasicBlock *DefMBB = DefMI->getParent();
226f69adfb8SWang, Pengfei     if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
2274cbaaf4aSWang, Pengfei       continue;
2284cbaaf4aSWang, Pengfei     if (DefMI->isPHI()) {
2294cbaaf4aSWang, Pengfei       for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
2304cbaaf4aSWang, Pengfei         if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
2314cbaaf4aSWang, Pengfei           RecordShape(DefMI, DefMBB); // In this case, PHI is also a shape def.
2324cbaaf4aSWang, Pengfei         else
2334cbaaf4aSWang, Pengfei           WorkList.push_back(DefMI->getOperand(I).getReg());
2344cbaaf4aSWang, Pengfei     } else {
2354cbaaf4aSWang, Pengfei       RecordShape(DefMI, DefMBB);
236a5d9e0c7SWang, Pengfei     }
237a5d9e0c7SWang, Pengfei   }
238a5d9e0c7SWang, Pengfei }
239a5d9e0c7SWang, Pengfei 
runOnMachineFunction(MachineFunction & MF)2404cbaaf4aSWang, Pengfei bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
2414cbaaf4aSWang, Pengfei   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
2424cbaaf4aSWang, Pengfei   const TargetInstrInfo *TII = ST.getInstrInfo();
2434cbaaf4aSWang, Pengfei   const TargetRegisterInfo *TRI = ST.getRegisterInfo();
244a5d9e0c7SWang, Pengfei   const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
245c4dba471SLuo, Yuanke   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2464cbaaf4aSWang, Pengfei 
2474cbaaf4aSWang, Pengfei   BitVector AMXRegs(TRI->getNumRegs());
2484cbaaf4aSWang, Pengfei   for (unsigned I = 0; I < RC->getNumRegs(); I++)
2494cbaaf4aSWang, Pengfei     AMXRegs.set(X86::TMM0 + I);
2504cbaaf4aSWang, Pengfei 
2514cbaaf4aSWang, Pengfei   // Iterate MF to collect information.
2524cbaaf4aSWang, Pengfei   MRI = &MF.getRegInfo();
2534cbaaf4aSWang, Pengfei   MLI = &getAnalysis<MachineLoopInfo>();
2544cbaaf4aSWang, Pengfei   SmallSet<MIRef, 8> CfgNeedInsert;
2554cbaaf4aSWang, Pengfei   SmallVector<MachineBasicBlock *, 8> CfgLiveInBBs;
2564cbaaf4aSWang, Pengfei   for (auto &MBB : MF) {
2574cbaaf4aSWang, Pengfei     size_t Pos = 0;
2584cbaaf4aSWang, Pengfei     for (auto &MI : MBB) {
2594cbaaf4aSWang, Pengfei       ++Pos;
2604cbaaf4aSWang, Pengfei       if (isAMXInstruction(MI)) {
2614cbaaf4aSWang, Pengfei         // If there's call before the AMX, we need to reload tile config.
2624cbaaf4aSWang, Pengfei         if (BBVisitedInfo[&MBB].LastCall)
2634cbaaf4aSWang, Pengfei           CfgNeedInsert.insert(BBVisitedInfo[&MBB].LastCall);
2644cbaaf4aSWang, Pengfei         else // Otherwise, we need tile config to live in this BB.
2654cbaaf4aSWang, Pengfei           BBVisitedInfo[&MBB].NeedTileCfgLiveIn = true;
2664cbaaf4aSWang, Pengfei         // Always record the first AMX in case there's shape def after it.
2674cbaaf4aSWang, Pengfei         if (!BBVisitedInfo[&MBB].FirstAMX)
2684cbaaf4aSWang, Pengfei           BBVisitedInfo[&MBB].FirstAMX = MIRef(&MI, &MBB, Pos);
2694cbaaf4aSWang, Pengfei       } else if (MI.isCall() && isDestructiveCall(MI, AMXRegs)) {
2704cbaaf4aSWang, Pengfei         // Record the call only if the callee clobbers all AMX registers.
2714cbaaf4aSWang, Pengfei         BBVisitedInfo[&MBB].LastCall = MIRef(&MI, &MBB, Pos);
272a5d9e0c7SWang, Pengfei       }
2734cbaaf4aSWang, Pengfei     }
2744cbaaf4aSWang, Pengfei     if (BBVisitedInfo[&MBB].NeedTileCfgLiveIn) {
2754cbaaf4aSWang, Pengfei       if (&MBB == &MF.front())
2764cbaaf4aSWang, Pengfei         CfgNeedInsert.insert(MIRef(&MBB));
2774cbaaf4aSWang, Pengfei       else
2784cbaaf4aSWang, Pengfei         CfgLiveInBBs.push_back(&MBB);
279f80b2987SLuo, Yuanke     }
280016092d7SWang, Pengfei     if (BBVisitedInfo[&MBB].FirstAMX || BBVisitedInfo[&MBB].HasAMXRegLiveIn)
281016092d7SWang, Pengfei       for (auto *Succ : MBB.successors())
282016092d7SWang, Pengfei         if (!isLoopBackEdge(Succ, &MBB))
283016092d7SWang, Pengfei           BBVisitedInfo[Succ].HasAMXRegLiveIn = true;
284f80b2987SLuo, Yuanke   }
285f80b2987SLuo, Yuanke 
2864cbaaf4aSWang, Pengfei   // Update NeedTileCfgLiveIn for predecessors.
2874cbaaf4aSWang, Pengfei   while (!CfgLiveInBBs.empty()) {
2884cbaaf4aSWang, Pengfei     MachineBasicBlock *MBB = CfgLiveInBBs.pop_back_val();
2894cbaaf4aSWang, Pengfei     for (auto *Pred : MBB->predecessors()) {
2904cbaaf4aSWang, Pengfei       if (BBVisitedInfo[Pred].LastCall) {
2914cbaaf4aSWang, Pengfei         CfgNeedInsert.insert(BBVisitedInfo[Pred].LastCall);
2924cbaaf4aSWang, Pengfei       } else if (!BBVisitedInfo[Pred].NeedTileCfgLiveIn) {
2934cbaaf4aSWang, Pengfei         BBVisitedInfo[Pred].NeedTileCfgLiveIn = true;
2944cbaaf4aSWang, Pengfei         if (Pred == &MF.front())
2954cbaaf4aSWang, Pengfei           CfgNeedInsert.insert(MIRef(Pred));
2964cbaaf4aSWang, Pengfei         else
2974cbaaf4aSWang, Pengfei           CfgLiveInBBs.push_back(Pred);
2984cbaaf4aSWang, Pengfei       }
2994cbaaf4aSWang, Pengfei     }
3004cbaaf4aSWang, Pengfei   }
301f80b2987SLuo, Yuanke 
3024cbaaf4aSWang, Pengfei   // There's no AMX instruction if we didn't find a tile config live in point.
3034cbaaf4aSWang, Pengfei   if (CfgNeedInsert.empty())
304f80b2987SLuo, Yuanke     return false;
305c4dba471SLuo, Yuanke   X86FI->setHasVirtualTileReg(true);
3064cbaaf4aSWang, Pengfei 
307a3b52a9dSWang, Pengfei   // Avoid to insert ldtilecfg before any shape defs.
308016092d7SWang, Pengfei   SmallVector<MachineBasicBlock *, 8> WorkList;
309016092d7SWang, Pengfei   for (auto &I : ShapeBBs) {
310016092d7SWang, Pengfei     // TODO: We can hoist shapes across BBs here.
311f3ad7ea0SLuo, Yuanke     if (BBVisitedInfo[I.first].HasAMXRegLiveIn) {
312f3ad7ea0SLuo, Yuanke       // We are not able to config tile registers since the shape to config
313f3ad7ea0SLuo, Yuanke       // is not defined yet. Emit error message and continue. The function
314f3ad7ea0SLuo, Yuanke       // would not config tile registers.
315f3ad7ea0SLuo, Yuanke       emitErrorMsg(MF);
316f3ad7ea0SLuo, Yuanke       return false;
317f3ad7ea0SLuo, Yuanke     }
318016092d7SWang, Pengfei     if (BBVisitedInfo[I.first].FirstAMX &&
319016092d7SWang, Pengfei         BBVisitedInfo[I.first].FirstAMX < I.second.back() &&
320f3ad7ea0SLuo, Yuanke         !hoistShapesInBB(I.first, I.second)) {
321f3ad7ea0SLuo, Yuanke       emitErrorMsg(MF);
322f3ad7ea0SLuo, Yuanke       return false;
323f3ad7ea0SLuo, Yuanke     }
324016092d7SWang, Pengfei     WorkList.push_back(I.first);
325016092d7SWang, Pengfei   }
3264cbaaf4aSWang, Pengfei   while (!WorkList.empty()) {
3274cbaaf4aSWang, Pengfei     MachineBasicBlock *MBB = WorkList.pop_back_val();
328a3b52a9dSWang, Pengfei     for (auto *Pred : MBB->predecessors()) {
329a3b52a9dSWang, Pengfei       if (!BBVisitedInfo[Pred].TileCfgForbidden && !isLoopBackEdge(MBB, Pred)) {
330a3b52a9dSWang, Pengfei         BBVisitedInfo[Pred].TileCfgForbidden = true;
331a3b52a9dSWang, Pengfei         WorkList.push_back(Pred);
3324cbaaf4aSWang, Pengfei       }
333a3b52a9dSWang, Pengfei     }
3344cbaaf4aSWang, Pengfei   }
3354cbaaf4aSWang, Pengfei 
3364cbaaf4aSWang, Pengfei   DebugLoc DL;
3374cbaaf4aSWang, Pengfei   SmallSet<MIRef, 8> VisitedOrInserted;
3384cbaaf4aSWang, Pengfei   int SS = MF.getFrameInfo().CreateStackObject(
3394cbaaf4aSWang, Pengfei       ST.getTileConfigSize(), ST.getTileConfigAlignment(), false);
3404cbaaf4aSWang, Pengfei 
3414cbaaf4aSWang, Pengfei   // Try to insert for the tile config live in points.
342db23f277SSimon Pilgrim   for (const auto &I : CfgNeedInsert) {
3434cbaaf4aSWang, Pengfei     SmallSet<MIRef, 8> InsertPoints;
3444cbaaf4aSWang, Pengfei     SmallVector<MIRef, 8> WorkList({I});
3454cbaaf4aSWang, Pengfei     while (!WorkList.empty()) {
3464cbaaf4aSWang, Pengfei       MIRef I = WorkList.pop_back_val();
3474cbaaf4aSWang, Pengfei       if (!VisitedOrInserted.count(I)) {
348a3b52a9dSWang, Pengfei         if (!BBVisitedInfo[I.MBB].TileCfgForbidden) {
3494cbaaf4aSWang, Pengfei           // If the BB is all shapes reachable, stop sink and try to insert.
3504cbaaf4aSWang, Pengfei           InsertPoints.insert(I);
3514cbaaf4aSWang, Pengfei         } else {
3524cbaaf4aSWang, Pengfei           // Avoid the BB to be multi visited.
3534cbaaf4aSWang, Pengfei           VisitedOrInserted.insert(I);
3544cbaaf4aSWang, Pengfei           // Sink the inserting point along the chain with NeedTileCfgLiveIn =
3554cbaaf4aSWang, Pengfei           // true when MBB isn't all shapes reachable.
3564cbaaf4aSWang, Pengfei           for (auto *Succ : I.MBB->successors())
3574cbaaf4aSWang, Pengfei             if (BBVisitedInfo[Succ].NeedTileCfgLiveIn)
3584cbaaf4aSWang, Pengfei               WorkList.push_back(MIRef(Succ));
3594cbaaf4aSWang, Pengfei         }
3604cbaaf4aSWang, Pengfei       }
3614cbaaf4aSWang, Pengfei     }
3624cbaaf4aSWang, Pengfei 
3634cbaaf4aSWang, Pengfei     // A given point might be forked due to shape conditions are not met.
3644cbaaf4aSWang, Pengfei     for (MIRef I : InsertPoints) {
3654cbaaf4aSWang, Pengfei       // Make sure we insert ldtilecfg after the last shape def in MBB.
366016092d7SWang, Pengfei       if (ShapeBBs.count(I.MBB) && I < ShapeBBs[I.MBB].back())
367016092d7SWang, Pengfei         I = ShapeBBs[I.MBB].back();
3684cbaaf4aSWang, Pengfei       // There're chances the MBB is sunk more than once. Record it to avoid
3694cbaaf4aSWang, Pengfei       // multi insert.
3704cbaaf4aSWang, Pengfei       if (VisitedOrInserted.insert(I).second) {
3714cbaaf4aSWang, Pengfei         auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin();
372*aaaf9cedSLuo, Yuanke         addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::PLDTILECFGV)),
3734cbaaf4aSWang, Pengfei                           SS);
3744cbaaf4aSWang, Pengfei       }
3754cbaaf4aSWang, Pengfei     }
3764cbaaf4aSWang, Pengfei   }
3774cbaaf4aSWang, Pengfei 
3784cbaaf4aSWang, Pengfei   // Zero stack slot.
3794cbaaf4aSWang, Pengfei   MachineBasicBlock &MBB = MF.front();
3804cbaaf4aSWang, Pengfei   MachineInstr *MI = &*MBB.begin();
3814cbaaf4aSWang, Pengfei   if (ST.hasAVX512()) {
3824cbaaf4aSWang, Pengfei     Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
383373ce147SLuo, Yuanke     BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm);
3844cbaaf4aSWang, Pengfei     addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS)
3854cbaaf4aSWang, Pengfei         .addReg(Zmm);
3864cbaaf4aSWang, Pengfei   } else if (ST.hasAVX2()) {
3874cbaaf4aSWang, Pengfei     Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
388373ce147SLuo, Yuanke     BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm);
3894cbaaf4aSWang, Pengfei     addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS)
3904cbaaf4aSWang, Pengfei         .addReg(Ymm);
3914cbaaf4aSWang, Pengfei     addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32)
3924cbaaf4aSWang, Pengfei         .addReg(Ymm);
3934cbaaf4aSWang, Pengfei   } else {
3944cbaaf4aSWang, Pengfei     assert(ST.hasSSE2() && "AMX should assume SSE2 enabled");
395373ce147SLuo, Yuanke     unsigned StoreOpc = ST.hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
3964cbaaf4aSWang, Pengfei     Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
397373ce147SLuo, Yuanke     BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm);
398373ce147SLuo, Yuanke     addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS).addReg(Xmm);
399373ce147SLuo, Yuanke     addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 16)
4004cbaaf4aSWang, Pengfei         .addReg(Xmm);
401373ce147SLuo, Yuanke     addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 32)
4024cbaaf4aSWang, Pengfei         .addReg(Xmm);
403373ce147SLuo, Yuanke     addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), SS, 48)
4044cbaaf4aSWang, Pengfei         .addReg(Xmm);
4054cbaaf4aSWang, Pengfei   }
406a3b52a9dSWang, Pengfei   // Fill in the palette first.
407a3b52a9dSWang, Pengfei   addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), SS).addImm(1);
4084cbaaf4aSWang, Pengfei 
409f80b2987SLuo, Yuanke   return true;
410f80b2987SLuo, Yuanke }
411f80b2987SLuo, Yuanke 
createX86PreTileConfigPass()412f80b2987SLuo, Yuanke FunctionPass *llvm::createX86PreTileConfigPass() {
413f80b2987SLuo, Yuanke   return new X86PreTileConfig();
414f80b2987SLuo, Yuanke }
415