//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
// inefficiency on some targets.
//
// For example:
//    fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//    dup v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
//    st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
//    zip1 v2.4s, v0.4s, v1.4s
//    zip2 v3.4s, v0.4s, v1.4s
//    stp  q2, q3,  addr
//
//===----------------------------------------------------------------------===//
345a2583f0SAbderrazek Zaafrani 
#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
535a2583f0SAbderrazek Zaafrani 
545a2583f0SAbderrazek Zaafrani using namespace llvm;
555a2583f0SAbderrazek Zaafrani 
565a2583f0SAbderrazek Zaafrani #define DEBUG_TYPE "aarch64-simdinstr-opt"
575a2583f0SAbderrazek Zaafrani 
585a2583f0SAbderrazek Zaafrani STATISTIC(NumModifiedInstr,
595a2583f0SAbderrazek Zaafrani           "Number of SIMD instructions modified");
605a2583f0SAbderrazek Zaafrani 
615a2583f0SAbderrazek Zaafrani #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
625a2583f0SAbderrazek Zaafrani   "AArch64 SIMD instructions optimization pass"
635a2583f0SAbderrazek Zaafrani 
645a2583f0SAbderrazek Zaafrani namespace {
655a2583f0SAbderrazek Zaafrani 
665a2583f0SAbderrazek Zaafrani struct AArch64SIMDInstrOpt : public MachineFunctionPass {
675a2583f0SAbderrazek Zaafrani   static char ID;
685a2583f0SAbderrazek Zaafrani 
695a2583f0SAbderrazek Zaafrani   const TargetInstrInfo *TII;
705a2583f0SAbderrazek Zaafrani   MachineRegisterInfo *MRI;
715a2583f0SAbderrazek Zaafrani   TargetSchedModel SchedModel;
725a2583f0SAbderrazek Zaafrani 
735a2583f0SAbderrazek Zaafrani   // The two maps below are used to cache decisions instead of recomputing:
745a2583f0SAbderrazek Zaafrani   // This is used to cache instruction replacement decisions within function
755a2583f0SAbderrazek Zaafrani   // units and across function units.
765a2583f0SAbderrazek Zaafrani   std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
77a9134e86SEvandro Menezes   // This is used to cache the decision of whether to leave the interleaved
78a9134e86SEvandro Menezes   // store instructions replacement pass early or not for a particular target.
795a2583f0SAbderrazek Zaafrani   std::unordered_map<std::string, bool> InterlEarlyExit;
805a2583f0SAbderrazek Zaafrani 
815a2583f0SAbderrazek Zaafrani   typedef enum {
825a2583f0SAbderrazek Zaafrani     VectorElem,
835a2583f0SAbderrazek Zaafrani     Interleave
845a2583f0SAbderrazek Zaafrani   } Subpass;
855a2583f0SAbderrazek Zaafrani 
865a2583f0SAbderrazek Zaafrani   // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
875a2583f0SAbderrazek Zaafrani   struct InstReplInfo {
885a2583f0SAbderrazek Zaafrani     unsigned OrigOpc;
895a2583f0SAbderrazek Zaafrani 		std::vector<unsigned> ReplOpc;
905a2583f0SAbderrazek Zaafrani     const TargetRegisterClass RC;
915a2583f0SAbderrazek Zaafrani   };
925a2583f0SAbderrazek Zaafrani 
935a2583f0SAbderrazek Zaafrani #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
945a2583f0SAbderrazek Zaafrani   {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
955a2583f0SAbderrazek Zaafrani #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
965a2583f0SAbderrazek Zaafrani                 OpcR7, OpcR8, OpcR9, RC) \
97a9134e86SEvandro Menezes   {OpcOrg, \
98a9134e86SEvandro Menezes    {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
995a2583f0SAbderrazek Zaafrani 
1005a2583f0SAbderrazek Zaafrani   // The Instruction Replacement Table:
1015a2583f0SAbderrazek Zaafrani   std::vector<InstReplInfo> IRT = {
1025a2583f0SAbderrazek Zaafrani     // ST2 instructions
1035a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1045a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::FPR128RegClass),
1055a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1065a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::FPR128RegClass),
1075a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1085a2583f0SAbderrazek Zaafrani           AArch64::STPDi, AArch64::FPR64RegClass),
1095a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1105a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::FPR128RegClass),
1115a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1125a2583f0SAbderrazek Zaafrani           AArch64::STPDi, AArch64::FPR64RegClass),
1135a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1145a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::FPR128RegClass),
1155a2583f0SAbderrazek Zaafrani     RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1165a2583f0SAbderrazek Zaafrani           AArch64::STPDi, AArch64::FPR64RegClass),
1175a2583f0SAbderrazek Zaafrani     // ST4 instructions
1185a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1195a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
1205a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1215a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1225a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1235a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
1245a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1255a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1265a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1275a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
1285a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1295a2583f0SAbderrazek Zaafrani           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
1305a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1315a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
1325a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1335a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1345a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1355a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
1365a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1375a2583f0SAbderrazek Zaafrani           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
1385a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1395a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
1405a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1415a2583f0SAbderrazek Zaafrani           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1425a2583f0SAbderrazek Zaafrani     RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1435a2583f0SAbderrazek Zaafrani           AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
1445a2583f0SAbderrazek Zaafrani           AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1455a2583f0SAbderrazek Zaafrani           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
1465a2583f0SAbderrazek Zaafrani   };
1475a2583f0SAbderrazek Zaafrani 
1485a2583f0SAbderrazek Zaafrani   // A costly instruction is replaced in this work by N efficient instructions
1495a2583f0SAbderrazek Zaafrani   // The maximum of N is curently 10 and it is for ST4 case.
1505a2583f0SAbderrazek Zaafrani   static const unsigned MaxNumRepl = 10;
1515a2583f0SAbderrazek Zaafrani 
AArch64SIMDInstrOpt__anone10fe4070111::AArch64SIMDInstrOpt1525a2583f0SAbderrazek Zaafrani   AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
1535a2583f0SAbderrazek Zaafrani     initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
1545a2583f0SAbderrazek Zaafrani   }
1555a2583f0SAbderrazek Zaafrani 
1565a2583f0SAbderrazek Zaafrani   /// Based only on latency of instructions, determine if it is cost efficient
1575a2583f0SAbderrazek Zaafrani   /// to replace the instruction InstDesc by the instructions stored in the
1585a2583f0SAbderrazek Zaafrani   /// array InstDescRepl.
1595a2583f0SAbderrazek Zaafrani   /// Return true if replacement is expected to be faster.
1605a2583f0SAbderrazek Zaafrani   bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
1615a2583f0SAbderrazek Zaafrani                          SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
1625a2583f0SAbderrazek Zaafrani 
1635a2583f0SAbderrazek Zaafrani   /// Determine if we need to exit the instruction replacement optimization
164a9134e86SEvandro Menezes   /// passes early. This makes sure that no compile time is spent in this pass
165a9134e86SEvandro Menezes   /// for targets with no need for any of these optimizations.
166a9134e86SEvandro Menezes   /// Return true if early exit of the pass is recommended.
1675a2583f0SAbderrazek Zaafrani   bool shouldExitEarly(MachineFunction *MF, Subpass SP);
1685a2583f0SAbderrazek Zaafrani 
1695a2583f0SAbderrazek Zaafrani   /// Check whether an equivalent DUP instruction has already been
1705a2583f0SAbderrazek Zaafrani   /// created or not.
171a9134e86SEvandro Menezes   /// Return true when the DUP instruction already exists. In this case,
1725a2583f0SAbderrazek Zaafrani   /// DestReg will point to the destination of the already created DUP.
1735a2583f0SAbderrazek Zaafrani   bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
1745a2583f0SAbderrazek Zaafrani                 unsigned LaneNumber, unsigned *DestReg) const;
1755a2583f0SAbderrazek Zaafrani 
1765a2583f0SAbderrazek Zaafrani   /// Certain SIMD instructions with vector element operand are not efficient.
1775a2583f0SAbderrazek Zaafrani   /// Rewrite them into SIMD instructions with vector operands. This rewrite
1785a2583f0SAbderrazek Zaafrani   /// is driven by the latency of the instructions.
1795a2583f0SAbderrazek Zaafrani   /// Return true if the SIMD instruction is modified.
1805a2583f0SAbderrazek Zaafrani   bool optimizeVectElement(MachineInstr &MI);
1815a2583f0SAbderrazek Zaafrani 
1825a2583f0SAbderrazek Zaafrani   /// Process The REG_SEQUENCE instruction, and extract the source
183a9134e86SEvandro Menezes   /// operands of the ST2/4 instruction from it.
1845a2583f0SAbderrazek Zaafrani   /// Example of such instructions.
1855a2583f0SAbderrazek Zaafrani   ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
1865a2583f0SAbderrazek Zaafrani   /// Return true when the instruction is processed successfully.
1875a2583f0SAbderrazek Zaafrani   bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
1885a2583f0SAbderrazek Zaafrani                          unsigned* StRegKill, unsigned NumArg) const;
1895a2583f0SAbderrazek Zaafrani 
1905a2583f0SAbderrazek Zaafrani   /// Load/Store Interleaving instructions are not always beneficial.
191a9134e86SEvandro Menezes   /// Replace them by ZIP instructionand classical load/store.
1925a2583f0SAbderrazek Zaafrani   /// Return true if the SIMD instruction is modified.
1935a2583f0SAbderrazek Zaafrani   bool optimizeLdStInterleave(MachineInstr &MI);
1945a2583f0SAbderrazek Zaafrani 
1955a2583f0SAbderrazek Zaafrani   /// Return the number of useful source registers for this
196a9134e86SEvandro Menezes   /// instruction (2 for ST2 and 4 for ST4).
1975a2583f0SAbderrazek Zaafrani   unsigned determineSrcReg(MachineInstr &MI) const;
1985a2583f0SAbderrazek Zaafrani 
1995a2583f0SAbderrazek Zaafrani   bool runOnMachineFunction(MachineFunction &Fn) override;
2005a2583f0SAbderrazek Zaafrani 
getPassName__anone10fe4070111::AArch64SIMDInstrOpt2015a2583f0SAbderrazek Zaafrani   StringRef getPassName() const override {
2025a2583f0SAbderrazek Zaafrani     return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
2035a2583f0SAbderrazek Zaafrani   }
2045a2583f0SAbderrazek Zaafrani };
2055a2583f0SAbderrazek Zaafrani 
2065a2583f0SAbderrazek Zaafrani char AArch64SIMDInstrOpt::ID = 0;
2075a2583f0SAbderrazek Zaafrani 
2085a2583f0SAbderrazek Zaafrani } // end anonymous namespace
2095a2583f0SAbderrazek Zaafrani 
2105a2583f0SAbderrazek Zaafrani INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
2115a2583f0SAbderrazek Zaafrani                 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
2125a2583f0SAbderrazek Zaafrani 
2135a2583f0SAbderrazek Zaafrani /// Based only on latency of instructions, determine if it is cost efficient
2145a2583f0SAbderrazek Zaafrani /// to replace the instruction InstDesc by the instructions stored in the
2155a2583f0SAbderrazek Zaafrani /// array InstDescRepl.
2165a2583f0SAbderrazek Zaafrani /// Return true if replacement is expected to be faster.
2175a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction * MF,const MCInstrDesc * InstDesc,SmallVectorImpl<const MCInstrDesc * > & InstDescRepl)2185a2583f0SAbderrazek Zaafrani shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
2195a2583f0SAbderrazek Zaafrani                   SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
2205a2583f0SAbderrazek Zaafrani   // Check if replacement decision is already available in the cached table.
2215a2583f0SAbderrazek Zaafrani   // if so, return it.
222adcd0268SBenjamin Kramer   std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
223a9134e86SEvandro Menezes   auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
22437ef2255SJon Roelofs   auto It = SIMDInstrTable.find(InstID);
22537ef2255SJon Roelofs   if (It != SIMDInstrTable.end())
22637ef2255SJon Roelofs     return It->second;
2275a2583f0SAbderrazek Zaafrani 
2285a2583f0SAbderrazek Zaafrani   unsigned SCIdx = InstDesc->getSchedClass();
2295a2583f0SAbderrazek Zaafrani   const MCSchedClassDesc *SCDesc =
2305a2583f0SAbderrazek Zaafrani     SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
2315a2583f0SAbderrazek Zaafrani 
232a9134e86SEvandro Menezes   // If a target does not define resources for the instructions
2335a2583f0SAbderrazek Zaafrani   // of interest, then return false for no replacement.
2345a2583f0SAbderrazek Zaafrani   const MCSchedClassDesc *SCDescRepl;
2355a2583f0SAbderrazek Zaafrani   if (!SCDesc->isValid() || SCDesc->isVariant())
2365a2583f0SAbderrazek Zaafrani   {
2375a2583f0SAbderrazek Zaafrani     SIMDInstrTable[InstID] = false;
2385a2583f0SAbderrazek Zaafrani     return false;
2395a2583f0SAbderrazek Zaafrani   }
2405a2583f0SAbderrazek Zaafrani   for (auto IDesc : InstDescRepl)
2415a2583f0SAbderrazek Zaafrani   {
2425a2583f0SAbderrazek Zaafrani     SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
2435a2583f0SAbderrazek Zaafrani       IDesc->getSchedClass());
2445a2583f0SAbderrazek Zaafrani     if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
2455a2583f0SAbderrazek Zaafrani     {
2465a2583f0SAbderrazek Zaafrani       SIMDInstrTable[InstID] = false;
2475a2583f0SAbderrazek Zaafrani       return false;
2485a2583f0SAbderrazek Zaafrani     }
2495a2583f0SAbderrazek Zaafrani   }
2505a2583f0SAbderrazek Zaafrani 
2515a2583f0SAbderrazek Zaafrani   // Replacement cost.
2525a2583f0SAbderrazek Zaafrani   unsigned ReplCost = 0;
2535a2583f0SAbderrazek Zaafrani   for (auto IDesc :InstDescRepl)
2545a2583f0SAbderrazek Zaafrani     ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
2555a2583f0SAbderrazek Zaafrani 
2565a2583f0SAbderrazek Zaafrani   if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
2575a2583f0SAbderrazek Zaafrani   {
2585a2583f0SAbderrazek Zaafrani     SIMDInstrTable[InstID] = true;
2595a2583f0SAbderrazek Zaafrani     return true;
2605a2583f0SAbderrazek Zaafrani   }
2615a2583f0SAbderrazek Zaafrani   else
2625a2583f0SAbderrazek Zaafrani   {
2635a2583f0SAbderrazek Zaafrani     SIMDInstrTable[InstID] = false;
2645a2583f0SAbderrazek Zaafrani     return false;
2655a2583f0SAbderrazek Zaafrani   }
2665a2583f0SAbderrazek Zaafrani }
2675a2583f0SAbderrazek Zaafrani 
268a9134e86SEvandro Menezes /// Determine if we need to exit this pass for a kind of instruction replacement
269a9134e86SEvandro Menezes /// early. This makes sure that no compile time is spent in this pass for
270a9134e86SEvandro Menezes /// targets with no need for any of these optimizations beyond performing this
271a9134e86SEvandro Menezes /// check.
272a9134e86SEvandro Menezes /// Return true if early exit of this pass for a kind of instruction
273a9134e86SEvandro Menezes /// replacement is recommended for a target.
shouldExitEarly(MachineFunction * MF,Subpass SP)2745a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
2755a2583f0SAbderrazek Zaafrani   const MCInstrDesc* OriginalMCID;
2765a2583f0SAbderrazek Zaafrani   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
2775a2583f0SAbderrazek Zaafrani 
2785a2583f0SAbderrazek Zaafrani   switch (SP) {
279a9134e86SEvandro Menezes   // For this optimization, check by comparing the latency of a representative
280a9134e86SEvandro Menezes   // instruction to that of the replacement instructions.
281a9134e86SEvandro Menezes   // TODO: check for all concerned instructions.
2825a2583f0SAbderrazek Zaafrani   case VectorElem:
2835a2583f0SAbderrazek Zaafrani     OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
2845a2583f0SAbderrazek Zaafrani     ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
285a9134e86SEvandro Menezes     ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
2865a2583f0SAbderrazek Zaafrani     if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
2875a2583f0SAbderrazek Zaafrani       return false;
2885a2583f0SAbderrazek Zaafrani     break;
289a9134e86SEvandro Menezes 
290a9134e86SEvandro Menezes   // For this optimization, check for all concerned instructions.
2915a2583f0SAbderrazek Zaafrani   case Interleave:
292adcd0268SBenjamin Kramer     std::string Subtarget =
293adcd0268SBenjamin Kramer         std::string(SchedModel.getSubtargetInfo()->getCPU());
29483dc53d3SJon Roelofs     auto It = InterlEarlyExit.find(Subtarget);
29583dc53d3SJon Roelofs     if (It != InterlEarlyExit.end())
29683dc53d3SJon Roelofs       return It->second;
2975a2583f0SAbderrazek Zaafrani 
2985a2583f0SAbderrazek Zaafrani     for (auto &I : IRT) {
2995a2583f0SAbderrazek Zaafrani       OriginalMCID = &TII->get(I.OrigOpc);
3005a2583f0SAbderrazek Zaafrani       for (auto &Repl : I.ReplOpc)
3015a2583f0SAbderrazek Zaafrani         ReplInstrMCID.push_back(&TII->get(Repl));
3025a2583f0SAbderrazek Zaafrani       if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
3035a2583f0SAbderrazek Zaafrani         InterlEarlyExit[Subtarget] = false;
3045a2583f0SAbderrazek Zaafrani         return false;
3055a2583f0SAbderrazek Zaafrani       }
3065a2583f0SAbderrazek Zaafrani       ReplInstrMCID.clear();
3075a2583f0SAbderrazek Zaafrani     }
3085a2583f0SAbderrazek Zaafrani     InterlEarlyExit[Subtarget] = true;
3095a2583f0SAbderrazek Zaafrani     break;
3105a2583f0SAbderrazek Zaafrani   }
3115a2583f0SAbderrazek Zaafrani 
3125a2583f0SAbderrazek Zaafrani   return true;
3135a2583f0SAbderrazek Zaafrani }
3145a2583f0SAbderrazek Zaafrani 
3155a2583f0SAbderrazek Zaafrani /// Check whether an equivalent DUP instruction has already been
3165a2583f0SAbderrazek Zaafrani /// created or not.
317a9134e86SEvandro Menezes /// Return true when the DUP instruction already exists. In this case,
3185a2583f0SAbderrazek Zaafrani /// DestReg will point to the destination of the already created DUP.
reuseDUP(MachineInstr & MI,unsigned DupOpcode,unsigned SrcReg,unsigned LaneNumber,unsigned * DestReg) const3195a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
3205a2583f0SAbderrazek Zaafrani                                          unsigned SrcReg, unsigned LaneNumber,
3215a2583f0SAbderrazek Zaafrani                                          unsigned *DestReg) const {
3225a2583f0SAbderrazek Zaafrani   for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
3235a2583f0SAbderrazek Zaafrani        MII != MIE;) {
3245a2583f0SAbderrazek Zaafrani     MII--;
3255a2583f0SAbderrazek Zaafrani     MachineInstr *CurrentMI = &*MII;
3265a2583f0SAbderrazek Zaafrani 
3275a2583f0SAbderrazek Zaafrani     if (CurrentMI->getOpcode() == DupOpcode &&
3285a2583f0SAbderrazek Zaafrani         CurrentMI->getNumOperands() == 3 &&
3295a2583f0SAbderrazek Zaafrani         CurrentMI->getOperand(1).getReg() == SrcReg &&
3305a2583f0SAbderrazek Zaafrani         CurrentMI->getOperand(2).getImm() == LaneNumber) {
3315a2583f0SAbderrazek Zaafrani       *DestReg = CurrentMI->getOperand(0).getReg();
3325a2583f0SAbderrazek Zaafrani       return true;
3335a2583f0SAbderrazek Zaafrani     }
3345a2583f0SAbderrazek Zaafrani   }
3355a2583f0SAbderrazek Zaafrani 
3365a2583f0SAbderrazek Zaafrani   return false;
3375a2583f0SAbderrazek Zaafrani }
3385a2583f0SAbderrazek Zaafrani 
3395a2583f0SAbderrazek Zaafrani /// Certain SIMD instructions with vector element operand are not efficient.
3405a2583f0SAbderrazek Zaafrani /// Rewrite them into SIMD instructions with vector operands. This rewrite
3415a2583f0SAbderrazek Zaafrani /// is driven by the latency of the instructions.
342a9134e86SEvandro Menezes /// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
343a9134e86SEvandro Menezes /// and FMULX and hence they are hardcoded.
3445a2583f0SAbderrazek Zaafrani ///
345a9134e86SEvandro Menezes /// For example:
3465a2583f0SAbderrazek Zaafrani ///    fmla v0.4s, v1.4s, v2.s[1]
347a9134e86SEvandro Menezes ///
348a9134e86SEvandro Menezes /// Is rewritten into
349a9134e86SEvandro Menezes ///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
3505a2583f0SAbderrazek Zaafrani ///    fmla v0.4s, v1.4s, v3.4s
351a9134e86SEvandro Menezes ///
3525a2583f0SAbderrazek Zaafrani /// Return true if the SIMD instruction is modified.
optimizeVectElement(MachineInstr & MI)3535a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
3545a2583f0SAbderrazek Zaafrani   const MCInstrDesc *MulMCID, *DupMCID;
3555a2583f0SAbderrazek Zaafrani   const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
3565a2583f0SAbderrazek Zaafrani 
3575a2583f0SAbderrazek Zaafrani   switch (MI.getOpcode()) {
3585a2583f0SAbderrazek Zaafrani   default:
3595a2583f0SAbderrazek Zaafrani     return false;
3605a2583f0SAbderrazek Zaafrani 
3615a2583f0SAbderrazek Zaafrani   // 4X32 instructions
3625a2583f0SAbderrazek Zaafrani   case AArch64::FMLAv4i32_indexed:
3635a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3645a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMLAv4f32);
3655a2583f0SAbderrazek Zaafrani     break;
3665a2583f0SAbderrazek Zaafrani   case AArch64::FMLSv4i32_indexed:
3675a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3685a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMLSv4f32);
3695a2583f0SAbderrazek Zaafrani     break;
3705a2583f0SAbderrazek Zaafrani   case AArch64::FMULXv4i32_indexed:
3715a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3725a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMULXv4f32);
3735a2583f0SAbderrazek Zaafrani     break;
3745a2583f0SAbderrazek Zaafrani   case AArch64::FMULv4i32_indexed:
3755a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv4i32lane);
3765a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMULv4f32);
3775a2583f0SAbderrazek Zaafrani     break;
3785a2583f0SAbderrazek Zaafrani 
3795a2583f0SAbderrazek Zaafrani   // 2X64 instructions
3805a2583f0SAbderrazek Zaafrani   case AArch64::FMLAv2i64_indexed:
3815a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3825a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMLAv2f64);
3835a2583f0SAbderrazek Zaafrani     break;
3845a2583f0SAbderrazek Zaafrani   case AArch64::FMLSv2i64_indexed:
3855a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3865a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMLSv2f64);
3875a2583f0SAbderrazek Zaafrani     break;
3885a2583f0SAbderrazek Zaafrani   case AArch64::FMULXv2i64_indexed:
3895a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3905a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMULXv2f64);
3915a2583f0SAbderrazek Zaafrani     break;
3925a2583f0SAbderrazek Zaafrani   case AArch64::FMULv2i64_indexed:
3935a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i64lane);
3945a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMULv2f64);
3955a2583f0SAbderrazek Zaafrani     break;
3965a2583f0SAbderrazek Zaafrani 
3975a2583f0SAbderrazek Zaafrani   // 2X32 instructions
3985a2583f0SAbderrazek Zaafrani   case AArch64::FMLAv2i32_indexed:
3995a2583f0SAbderrazek Zaafrani     RC = &AArch64::FPR64RegClass;
4005a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4015a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMLAv2f32);
4025a2583f0SAbderrazek Zaafrani     break;
4035a2583f0SAbderrazek Zaafrani   case AArch64::FMLSv2i32_indexed:
4045a2583f0SAbderrazek Zaafrani     RC = &AArch64::FPR64RegClass;
4055a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4065a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMLSv2f32);
4075a2583f0SAbderrazek Zaafrani     break;
4085a2583f0SAbderrazek Zaafrani   case AArch64::FMULXv2i32_indexed:
4095a2583f0SAbderrazek Zaafrani     RC = &AArch64::FPR64RegClass;
4105a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4115a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMULXv2f32);
4125a2583f0SAbderrazek Zaafrani     break;
4135a2583f0SAbderrazek Zaafrani   case AArch64::FMULv2i32_indexed:
4145a2583f0SAbderrazek Zaafrani     RC = &AArch64::FPR64RegClass;
4155a2583f0SAbderrazek Zaafrani     DupMCID = &TII->get(AArch64::DUPv2i32lane);
4165a2583f0SAbderrazek Zaafrani     MulMCID = &TII->get(AArch64::FMULv2f32);
4175a2583f0SAbderrazek Zaafrani     break;
4185a2583f0SAbderrazek Zaafrani   }
4195a2583f0SAbderrazek Zaafrani 
4205a2583f0SAbderrazek Zaafrani   SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
4215a2583f0SAbderrazek Zaafrani   ReplInstrMCID.push_back(DupMCID);
4225a2583f0SAbderrazek Zaafrani   ReplInstrMCID.push_back(MulMCID);
4235a2583f0SAbderrazek Zaafrani   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
4245a2583f0SAbderrazek Zaafrani                          ReplInstrMCID))
4255a2583f0SAbderrazek Zaafrani     return false;
4265a2583f0SAbderrazek Zaafrani 
4275a2583f0SAbderrazek Zaafrani   const DebugLoc &DL = MI.getDebugLoc();
4285a2583f0SAbderrazek Zaafrani   MachineBasicBlock &MBB = *MI.getParent();
4295a2583f0SAbderrazek Zaafrani   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4305a2583f0SAbderrazek Zaafrani 
431a9134e86SEvandro Menezes   // Get the operands of the current SIMD arithmetic instruction.
4325ae66e56SDaniel Sanders   Register MulDest = MI.getOperand(0).getReg();
4335ae66e56SDaniel Sanders   Register SrcReg0 = MI.getOperand(1).getReg();
4345a2583f0SAbderrazek Zaafrani   unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
4355ae66e56SDaniel Sanders   Register SrcReg1 = MI.getOperand(2).getReg();
4365a2583f0SAbderrazek Zaafrani   unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
4375a2583f0SAbderrazek Zaafrani   unsigned DupDest;
4385a2583f0SAbderrazek Zaafrani 
4395a2583f0SAbderrazek Zaafrani   // Instructions of interest have either 4 or 5 operands.
4405a2583f0SAbderrazek Zaafrani   if (MI.getNumOperands() == 5) {
4415ae66e56SDaniel Sanders     Register SrcReg2 = MI.getOperand(3).getReg();
4425a2583f0SAbderrazek Zaafrani     unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
4435a2583f0SAbderrazek Zaafrani     unsigned LaneNumber = MI.getOperand(4).getImm();
4445a2583f0SAbderrazek Zaafrani     // Create a new DUP instruction. Note that if an equivalent DUP instruction
445a9134e86SEvandro Menezes     // has already been created before, then use that one instead of creating
4465a2583f0SAbderrazek Zaafrani     // a new one.
4475a2583f0SAbderrazek Zaafrani     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
4485a2583f0SAbderrazek Zaafrani       DupDest = MRI.createVirtualRegister(RC);
4495a2583f0SAbderrazek Zaafrani       BuildMI(MBB, MI, DL, *DupMCID, DupDest)
4505a2583f0SAbderrazek Zaafrani           .addReg(SrcReg2, Src2IsKill)
4515a2583f0SAbderrazek Zaafrani           .addImm(LaneNumber);
4525a2583f0SAbderrazek Zaafrani     }
4535a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *MulMCID, MulDest)
4545a2583f0SAbderrazek Zaafrani         .addReg(SrcReg0, Src0IsKill)
4555a2583f0SAbderrazek Zaafrani         .addReg(SrcReg1, Src1IsKill)
4565a2583f0SAbderrazek Zaafrani         .addReg(DupDest, Src2IsKill);
4575a2583f0SAbderrazek Zaafrani   } else if (MI.getNumOperands() == 4) {
4585a2583f0SAbderrazek Zaafrani     unsigned LaneNumber = MI.getOperand(3).getImm();
4595a2583f0SAbderrazek Zaafrani     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
4605a2583f0SAbderrazek Zaafrani       DupDest = MRI.createVirtualRegister(RC);
4615a2583f0SAbderrazek Zaafrani       BuildMI(MBB, MI, DL, *DupMCID, DupDest)
4625a2583f0SAbderrazek Zaafrani           .addReg(SrcReg1, Src1IsKill)
4635a2583f0SAbderrazek Zaafrani           .addImm(LaneNumber);
4645a2583f0SAbderrazek Zaafrani     }
4655a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *MulMCID, MulDest)
4665a2583f0SAbderrazek Zaafrani         .addReg(SrcReg0, Src0IsKill)
4675a2583f0SAbderrazek Zaafrani         .addReg(DupDest, Src1IsKill);
4685a2583f0SAbderrazek Zaafrani   } else {
4695a2583f0SAbderrazek Zaafrani     return false;
4705a2583f0SAbderrazek Zaafrani   }
4715a2583f0SAbderrazek Zaafrani 
4725a2583f0SAbderrazek Zaafrani   ++NumModifiedInstr;
4735a2583f0SAbderrazek Zaafrani   return true;
4745a2583f0SAbderrazek Zaafrani }
4755a2583f0SAbderrazek Zaafrani 
4765a2583f0SAbderrazek Zaafrani /// Load/Store Interleaving instructions are not always beneficial.
477a9134e86SEvandro Menezes /// Replace them by ZIP instructions and classical load/store.
4785a2583f0SAbderrazek Zaafrani ///
479a9134e86SEvandro Menezes /// For example:
4805a2583f0SAbderrazek Zaafrani ///    st2 {v0.4s, v1.4s}, addr
481a9134e86SEvandro Menezes ///
482a9134e86SEvandro Menezes /// Is rewritten into:
4835a2583f0SAbderrazek Zaafrani ///    zip1 v2.4s, v0.4s, v1.4s
4845a2583f0SAbderrazek Zaafrani ///    zip2 v3.4s, v0.4s, v1.4s
4855a2583f0SAbderrazek Zaafrani ///    stp  q2, q3, addr
4865a2583f0SAbderrazek Zaafrani //
487a9134e86SEvandro Menezes /// For example:
4885a2583f0SAbderrazek Zaafrani ///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
489a9134e86SEvandro Menezes ///
490a9134e86SEvandro Menezes /// Is rewritten into:
4915a2583f0SAbderrazek Zaafrani ///    zip1 v4.4s, v0.4s, v2.4s
4925a2583f0SAbderrazek Zaafrani ///    zip2 v5.4s, v0.4s, v2.4s
4935a2583f0SAbderrazek Zaafrani ///    zip1 v6.4s, v1.4s, v3.4s
4945a2583f0SAbderrazek Zaafrani ///    zip2 v7.4s, v1.4s, v3.4s
4955a2583f0SAbderrazek Zaafrani ///    zip1 v8.4s, v4.4s, v6.4s
4965a2583f0SAbderrazek Zaafrani ///    zip2 v9.4s, v4.4s, v6.4s
4975a2583f0SAbderrazek Zaafrani ///    zip1 v10.4s, v5.4s, v7.4s
4985a2583f0SAbderrazek Zaafrani ///    zip2 v11.4s, v5.4s, v7.4s
4995a2583f0SAbderrazek Zaafrani ///    stp  q8, q9, addr
5005a2583f0SAbderrazek Zaafrani ///    stp  q10, q11, addr+32
501a9134e86SEvandro Menezes ///
502a9134e86SEvandro Menezes /// Currently only instructions related to ST2 and ST4 are considered.
5035a2583f0SAbderrazek Zaafrani /// Other may be added later.
5045a2583f0SAbderrazek Zaafrani /// Return true if the SIMD instruction is modified.
optimizeLdStInterleave(MachineInstr & MI)5055a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
5065a2583f0SAbderrazek Zaafrani 
5075a2583f0SAbderrazek Zaafrani   unsigned SeqReg, AddrReg;
5085a2583f0SAbderrazek Zaafrani   unsigned StReg[4], StRegKill[4];
5095a2583f0SAbderrazek Zaafrani   MachineInstr *DefiningMI;
5105a2583f0SAbderrazek Zaafrani   const DebugLoc &DL = MI.getDebugLoc();
5115a2583f0SAbderrazek Zaafrani   MachineBasicBlock &MBB = *MI.getParent();
5125a2583f0SAbderrazek Zaafrani   SmallVector<unsigned, MaxNumRepl> ZipDest;
5135a2583f0SAbderrazek Zaafrani   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
5145a2583f0SAbderrazek Zaafrani 
5155a2583f0SAbderrazek Zaafrani   // If current instruction matches any of the rewriting rules, then
5165a2583f0SAbderrazek Zaafrani   // gather information about parameters of the new instructions.
5175a2583f0SAbderrazek Zaafrani   bool Match = false;
5185a2583f0SAbderrazek Zaafrani   for (auto &I : IRT) {
5195a2583f0SAbderrazek Zaafrani     if (MI.getOpcode() == I.OrigOpc) {
5205a2583f0SAbderrazek Zaafrani       SeqReg  = MI.getOperand(0).getReg();
5215a2583f0SAbderrazek Zaafrani       AddrReg = MI.getOperand(1).getReg();
5225a2583f0SAbderrazek Zaafrani       DefiningMI = MRI->getUniqueVRegDef(SeqReg);
5235a2583f0SAbderrazek Zaafrani       unsigned NumReg = determineSrcReg(MI);
5245a2583f0SAbderrazek Zaafrani       if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
5255a2583f0SAbderrazek Zaafrani         return false;
5265a2583f0SAbderrazek Zaafrani 
5275a2583f0SAbderrazek Zaafrani       for (auto &Repl : I.ReplOpc) {
5285a2583f0SAbderrazek Zaafrani         ReplInstrMCID.push_back(&TII->get(Repl));
5295a2583f0SAbderrazek Zaafrani         // Generate destination registers but only for non-store instruction.
5305a2583f0SAbderrazek Zaafrani         if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
5315a2583f0SAbderrazek Zaafrani           ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
5325a2583f0SAbderrazek Zaafrani       }
5335a2583f0SAbderrazek Zaafrani       Match = true;
5345a2583f0SAbderrazek Zaafrani       break;
5355a2583f0SAbderrazek Zaafrani     }
5365a2583f0SAbderrazek Zaafrani   }
5375a2583f0SAbderrazek Zaafrani 
5385a2583f0SAbderrazek Zaafrani   if (!Match)
5395a2583f0SAbderrazek Zaafrani     return false;
5405a2583f0SAbderrazek Zaafrani 
5415a2583f0SAbderrazek Zaafrani   // Determine if it is profitable to replace MI by the series of instructions
5425a2583f0SAbderrazek Zaafrani   // represented in ReplInstrMCID.
5435a2583f0SAbderrazek Zaafrani   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
5445a2583f0SAbderrazek Zaafrani                          ReplInstrMCID))
5455a2583f0SAbderrazek Zaafrani     return false;
5465a2583f0SAbderrazek Zaafrani 
547a9134e86SEvandro Menezes   // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
5485a2583f0SAbderrazek Zaafrani   // this point, the code generation is hardcoded and does not rely on the IRT
5495a2583f0SAbderrazek Zaafrani   // table used above given that code generation for ST2 replacement is somewhat
5505a2583f0SAbderrazek Zaafrani   // different than for ST4 replacement. We could have added more info into the
5515a2583f0SAbderrazek Zaafrani   // table related to how we build new instructions but we may be adding more
5525a2583f0SAbderrazek Zaafrani   // complexity with that).
5535a2583f0SAbderrazek Zaafrani   switch (MI.getOpcode()) {
5545a2583f0SAbderrazek Zaafrani   default:
5555a2583f0SAbderrazek Zaafrani     return false;
556a9134e86SEvandro Menezes 
5575a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov16b:
5585a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov8b:
5595a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov8h:
5605a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov4h:
5615a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov4s:
5625a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov2s:
5635a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov2d:
564a9134e86SEvandro Menezes     // ZIP instructions
5655a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
5665a2583f0SAbderrazek Zaafrani         .addReg(StReg[0])
5675a2583f0SAbderrazek Zaafrani         .addReg(StReg[1]);
5685a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
5695a2583f0SAbderrazek Zaafrani         .addReg(StReg[0], StRegKill[0])
5705a2583f0SAbderrazek Zaafrani         .addReg(StReg[1], StRegKill[1]);
571a9134e86SEvandro Menezes     // STP instructions
5725a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
5735a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[0])
5745a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[1])
5755a2583f0SAbderrazek Zaafrani         .addReg(AddrReg)
5765a2583f0SAbderrazek Zaafrani         .addImm(0);
5775a2583f0SAbderrazek Zaafrani     break;
578a9134e86SEvandro Menezes 
5795a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv16b:
5805a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv8b:
5815a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv8h:
5825a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv4h:
5835a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv4s:
5845a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv2s:
5855a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv2d:
586a9134e86SEvandro Menezes     // ZIP instructions
5875a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
5885a2583f0SAbderrazek Zaafrani         .addReg(StReg[0])
5895a2583f0SAbderrazek Zaafrani         .addReg(StReg[2]);
5905a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
5915a2583f0SAbderrazek Zaafrani         .addReg(StReg[0], StRegKill[0])
5925a2583f0SAbderrazek Zaafrani         .addReg(StReg[2], StRegKill[2]);
5935a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
5945a2583f0SAbderrazek Zaafrani         .addReg(StReg[1])
5955a2583f0SAbderrazek Zaafrani         .addReg(StReg[3]);
5965a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
5975a2583f0SAbderrazek Zaafrani         .addReg(StReg[1], StRegKill[1])
5985a2583f0SAbderrazek Zaafrani         .addReg(StReg[3], StRegKill[3]);
5995a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
6005a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[0])
6015a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[2]);
6025a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
6035a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[0])
6045a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[2]);
6055a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
6065a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[1])
6075a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[3]);
6085a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
6095a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[1])
6105a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[3]);
6115a2583f0SAbderrazek Zaafrani     // stp instructions
6125a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
6135a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[4])
6145a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[5])
6155a2583f0SAbderrazek Zaafrani         .addReg(AddrReg)
6165a2583f0SAbderrazek Zaafrani         .addImm(0);
6175a2583f0SAbderrazek Zaafrani     BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
6185a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[6])
6195a2583f0SAbderrazek Zaafrani         .addReg(ZipDest[7])
6205a2583f0SAbderrazek Zaafrani         .addReg(AddrReg)
6215a2583f0SAbderrazek Zaafrani         .addImm(2);
6225a2583f0SAbderrazek Zaafrani     break;
6235a2583f0SAbderrazek Zaafrani   }
6245a2583f0SAbderrazek Zaafrani 
6255a2583f0SAbderrazek Zaafrani   ++NumModifiedInstr;
6265a2583f0SAbderrazek Zaafrani   return true;
6275a2583f0SAbderrazek Zaafrani }
6285a2583f0SAbderrazek Zaafrani 
6295a2583f0SAbderrazek Zaafrani /// Process The REG_SEQUENCE instruction, and extract the source
630a9134e86SEvandro Menezes /// operands of the ST2/4 instruction from it.
6315a2583f0SAbderrazek Zaafrani /// Example of such instruction.
6325a2583f0SAbderrazek Zaafrani ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
6335a2583f0SAbderrazek Zaafrani /// Return true when the instruction is processed successfully.
processSeqRegInst(MachineInstr * DefiningMI,unsigned * StReg,unsigned * StRegKill,unsigned NumArg) const6345a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
6355a2583f0SAbderrazek Zaafrani      unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
636*5a667c0eSKazu Hirata   assert(DefiningMI != nullptr);
6375a2583f0SAbderrazek Zaafrani   if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
6385a2583f0SAbderrazek Zaafrani     return false;
6395a2583f0SAbderrazek Zaafrani 
6405a2583f0SAbderrazek Zaafrani   for (unsigned i=0; i<NumArg; i++) {
6415a2583f0SAbderrazek Zaafrani     StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
6425a2583f0SAbderrazek Zaafrani     StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
6435a2583f0SAbderrazek Zaafrani 
6445b8bbbecSZarko Todorovski     // Validation check for the other arguments.
6455a2583f0SAbderrazek Zaafrani     if (DefiningMI->getOperand(2*i+2).isImm()) {
6465a2583f0SAbderrazek Zaafrani       switch (DefiningMI->getOperand(2*i+2).getImm()) {
6475a2583f0SAbderrazek Zaafrani       default:
6485a2583f0SAbderrazek Zaafrani         return false;
649a9134e86SEvandro Menezes 
6505a2583f0SAbderrazek Zaafrani       case AArch64::dsub0:
6515a2583f0SAbderrazek Zaafrani       case AArch64::dsub1:
6525a2583f0SAbderrazek Zaafrani       case AArch64::dsub2:
6535a2583f0SAbderrazek Zaafrani       case AArch64::dsub3:
6545a2583f0SAbderrazek Zaafrani       case AArch64::qsub0:
6555a2583f0SAbderrazek Zaafrani       case AArch64::qsub1:
6565a2583f0SAbderrazek Zaafrani       case AArch64::qsub2:
6575a2583f0SAbderrazek Zaafrani       case AArch64::qsub3:
6585a2583f0SAbderrazek Zaafrani         break;
6595a2583f0SAbderrazek Zaafrani       }
6605a2583f0SAbderrazek Zaafrani     }
6615a2583f0SAbderrazek Zaafrani     else
6625a2583f0SAbderrazek Zaafrani       return false;
6635a2583f0SAbderrazek Zaafrani   }
6645a2583f0SAbderrazek Zaafrani   return true;
6655a2583f0SAbderrazek Zaafrani }
6665a2583f0SAbderrazek Zaafrani 
6675a2583f0SAbderrazek Zaafrani /// Return the number of useful source registers for this instruction
6685a2583f0SAbderrazek Zaafrani /// (2 for ST2 and 4 for ST4).
determineSrcReg(MachineInstr & MI) const6695a2583f0SAbderrazek Zaafrani unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
6705a2583f0SAbderrazek Zaafrani   switch (MI.getOpcode()) {
6715a2583f0SAbderrazek Zaafrani   default:
6725a2583f0SAbderrazek Zaafrani     llvm_unreachable("Unsupported instruction for this pass");
673a9134e86SEvandro Menezes 
6745a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov16b:
6755a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov8b:
6765a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov8h:
6775a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov4h:
6785a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov4s:
6795a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov2s:
6805a2583f0SAbderrazek Zaafrani   case AArch64::ST2Twov2d:
6815a2583f0SAbderrazek Zaafrani     return 2;
682a9134e86SEvandro Menezes 
6835a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv16b:
6845a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv8b:
6855a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv8h:
6865a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv4h:
6875a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv4s:
6885a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv2s:
6895a2583f0SAbderrazek Zaafrani   case AArch64::ST4Fourv2d:
6905a2583f0SAbderrazek Zaafrani     return 4;
6915a2583f0SAbderrazek Zaafrani   }
6925a2583f0SAbderrazek Zaafrani }
6935a2583f0SAbderrazek Zaafrani 
runOnMachineFunction(MachineFunction & MF)6945a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
695f1caa283SMatthias Braun   if (skipFunction(MF.getFunction()))
6965a2583f0SAbderrazek Zaafrani     return false;
6975a2583f0SAbderrazek Zaafrani 
6985a2583f0SAbderrazek Zaafrani   TII = MF.getSubtarget().getInstrInfo();
6995a2583f0SAbderrazek Zaafrani   MRI = &MF.getRegInfo();
7005a2583f0SAbderrazek Zaafrani   const TargetSubtargetInfo &ST = MF.getSubtarget();
7015a2583f0SAbderrazek Zaafrani   const AArch64InstrInfo *AAII =
7025a2583f0SAbderrazek Zaafrani       static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
7035a2583f0SAbderrazek Zaafrani   if (!AAII)
7045a2583f0SAbderrazek Zaafrani     return false;
7050d7df36cSSanjay Patel   SchedModel.init(&ST);
7065a2583f0SAbderrazek Zaafrani   if (!SchedModel.hasInstrSchedModel())
7075a2583f0SAbderrazek Zaafrani     return false;
7085a2583f0SAbderrazek Zaafrani 
7095a2583f0SAbderrazek Zaafrani   bool Changed = false;
7105a2583f0SAbderrazek Zaafrani   for (auto OptimizationKind : {VectorElem, Interleave}) {
7115a2583f0SAbderrazek Zaafrani     if (!shouldExitEarly(&MF, OptimizationKind)) {
7125a2583f0SAbderrazek Zaafrani       SmallVector<MachineInstr *, 8> RemoveMIs;
7135a2583f0SAbderrazek Zaafrani       for (MachineBasicBlock &MBB : MF) {
714ee0133dcSKazu Hirata         for (MachineInstr &MI : MBB) {
7155a2583f0SAbderrazek Zaafrani           bool InstRewrite;
7165a2583f0SAbderrazek Zaafrani           if (OptimizationKind == VectorElem)
7175a2583f0SAbderrazek Zaafrani             InstRewrite = optimizeVectElement(MI) ;
7185a2583f0SAbderrazek Zaafrani           else
7195a2583f0SAbderrazek Zaafrani             InstRewrite = optimizeLdStInterleave(MI);
7205a2583f0SAbderrazek Zaafrani           if (InstRewrite) {
7215a2583f0SAbderrazek Zaafrani             // Add MI to the list of instructions to be removed given that it
7225a2583f0SAbderrazek Zaafrani             // has been replaced.
7235a2583f0SAbderrazek Zaafrani             RemoveMIs.push_back(&MI);
7245a2583f0SAbderrazek Zaafrani             Changed = true;
7255a2583f0SAbderrazek Zaafrani           }
7265a2583f0SAbderrazek Zaafrani         }
7275a2583f0SAbderrazek Zaafrani       }
7285a2583f0SAbderrazek Zaafrani       for (MachineInstr *MI : RemoveMIs)
7295a2583f0SAbderrazek Zaafrani         MI->eraseFromParent();
7305a2583f0SAbderrazek Zaafrani     }
7315a2583f0SAbderrazek Zaafrani   }
7325a2583f0SAbderrazek Zaafrani 
7335a2583f0SAbderrazek Zaafrani   return Changed;
7345a2583f0SAbderrazek Zaafrani }
7355a2583f0SAbderrazek Zaafrani 
736a9134e86SEvandro Menezes /// Returns an instance of the high cost ASIMD instruction replacement
737a9134e86SEvandro Menezes /// optimization pass.
createAArch64SIMDInstrOptPass()7385a2583f0SAbderrazek Zaafrani FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
7395a2583f0SAbderrazek Zaafrani   return new AArch64SIMDInstrOpt();
7405a2583f0SAbderrazek Zaafrani }
741