//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with a vector element operand due to
// their inefficiency on some targets.
//
// For example:
//    fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//    dup v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
//    st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
//    zip1 v2.4s, v0.4s, v1.4s
//    zip2 v3.4s, v0.4s, v1.4s
//    stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//
345a2583f0SAbderrazek Zaafrani
355a2583f0SAbderrazek Zaafrani #include "AArch64InstrInfo.h"
365a2583f0SAbderrazek Zaafrani #include "llvm/ADT/SmallVector.h"
375a2583f0SAbderrazek Zaafrani #include "llvm/ADT/Statistic.h"
385a2583f0SAbderrazek Zaafrani #include "llvm/ADT/StringRef.h"
395a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineBasicBlock.h"
405a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineFunction.h"
415a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineFunctionPass.h"
425a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineInstr.h"
435a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineInstrBuilder.h"
445a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineOperand.h"
455a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/MachineRegisterInfo.h"
465a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/TargetInstrInfo.h"
475a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/TargetSchedule.h"
485a2583f0SAbderrazek Zaafrani #include "llvm/CodeGen/TargetSubtargetInfo.h"
495a2583f0SAbderrazek Zaafrani #include "llvm/MC/MCInstrDesc.h"
505a2583f0SAbderrazek Zaafrani #include "llvm/MC/MCSchedule.h"
515a2583f0SAbderrazek Zaafrani #include "llvm/Pass.h"
525a2583f0SAbderrazek Zaafrani #include <unordered_map>
535a2583f0SAbderrazek Zaafrani
545a2583f0SAbderrazek Zaafrani using namespace llvm;
555a2583f0SAbderrazek Zaafrani
565a2583f0SAbderrazek Zaafrani #define DEBUG_TYPE "aarch64-simdinstr-opt"
575a2583f0SAbderrazek Zaafrani
585a2583f0SAbderrazek Zaafrani STATISTIC(NumModifiedInstr,
595a2583f0SAbderrazek Zaafrani "Number of SIMD instructions modified");
605a2583f0SAbderrazek Zaafrani
615a2583f0SAbderrazek Zaafrani #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
625a2583f0SAbderrazek Zaafrani "AArch64 SIMD instructions optimization pass"
635a2583f0SAbderrazek Zaafrani
645a2583f0SAbderrazek Zaafrani namespace {
655a2583f0SAbderrazek Zaafrani
665a2583f0SAbderrazek Zaafrani struct AArch64SIMDInstrOpt : public MachineFunctionPass {
675a2583f0SAbderrazek Zaafrani static char ID;
685a2583f0SAbderrazek Zaafrani
695a2583f0SAbderrazek Zaafrani const TargetInstrInfo *TII;
705a2583f0SAbderrazek Zaafrani MachineRegisterInfo *MRI;
715a2583f0SAbderrazek Zaafrani TargetSchedModel SchedModel;
725a2583f0SAbderrazek Zaafrani
735a2583f0SAbderrazek Zaafrani // The two maps below are used to cache decisions instead of recomputing:
745a2583f0SAbderrazek Zaafrani // This is used to cache instruction replacement decisions within function
755a2583f0SAbderrazek Zaafrani // units and across function units.
765a2583f0SAbderrazek Zaafrani std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
77a9134e86SEvandro Menezes // This is used to cache the decision of whether to leave the interleaved
78a9134e86SEvandro Menezes // store instructions replacement pass early or not for a particular target.
795a2583f0SAbderrazek Zaafrani std::unordered_map<std::string, bool> InterlEarlyExit;
805a2583f0SAbderrazek Zaafrani
815a2583f0SAbderrazek Zaafrani typedef enum {
825a2583f0SAbderrazek Zaafrani VectorElem,
835a2583f0SAbderrazek Zaafrani Interleave
845a2583f0SAbderrazek Zaafrani } Subpass;
855a2583f0SAbderrazek Zaafrani
865a2583f0SAbderrazek Zaafrani // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
875a2583f0SAbderrazek Zaafrani struct InstReplInfo {
885a2583f0SAbderrazek Zaafrani unsigned OrigOpc;
895a2583f0SAbderrazek Zaafrani std::vector<unsigned> ReplOpc;
905a2583f0SAbderrazek Zaafrani const TargetRegisterClass RC;
915a2583f0SAbderrazek Zaafrani };
925a2583f0SAbderrazek Zaafrani
935a2583f0SAbderrazek Zaafrani #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
945a2583f0SAbderrazek Zaafrani {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
955a2583f0SAbderrazek Zaafrani #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
965a2583f0SAbderrazek Zaafrani OpcR7, OpcR8, OpcR9, RC) \
97a9134e86SEvandro Menezes {OpcOrg, \
98a9134e86SEvandro Menezes {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
995a2583f0SAbderrazek Zaafrani
1005a2583f0SAbderrazek Zaafrani // The Instruction Replacement Table:
1015a2583f0SAbderrazek Zaafrani std::vector<InstReplInfo> IRT = {
1025a2583f0SAbderrazek Zaafrani // ST2 instructions
1035a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1045a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::FPR128RegClass),
1055a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1065a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::FPR128RegClass),
1075a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1085a2583f0SAbderrazek Zaafrani AArch64::STPDi, AArch64::FPR64RegClass),
1095a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1105a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::FPR128RegClass),
1115a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1125a2583f0SAbderrazek Zaafrani AArch64::STPDi, AArch64::FPR64RegClass),
1135a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1145a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::FPR128RegClass),
1155a2583f0SAbderrazek Zaafrani RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1165a2583f0SAbderrazek Zaafrani AArch64::STPDi, AArch64::FPR64RegClass),
1175a2583f0SAbderrazek Zaafrani // ST4 instructions
1185a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1195a2583f0SAbderrazek Zaafrani AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
1205a2583f0SAbderrazek Zaafrani AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
1215a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1225a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1235a2583f0SAbderrazek Zaafrani AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
1245a2583f0SAbderrazek Zaafrani AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
1255a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1265a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1275a2583f0SAbderrazek Zaafrani AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
1285a2583f0SAbderrazek Zaafrani AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
1295a2583f0SAbderrazek Zaafrani AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
1305a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1315a2583f0SAbderrazek Zaafrani AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
1325a2583f0SAbderrazek Zaafrani AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
1335a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1345a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1355a2583f0SAbderrazek Zaafrani AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
1365a2583f0SAbderrazek Zaafrani AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
1375a2583f0SAbderrazek Zaafrani AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
1385a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1395a2583f0SAbderrazek Zaafrani AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
1405a2583f0SAbderrazek Zaafrani AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
1415a2583f0SAbderrazek Zaafrani AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
1425a2583f0SAbderrazek Zaafrani RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1435a2583f0SAbderrazek Zaafrani AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
1445a2583f0SAbderrazek Zaafrani AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
1455a2583f0SAbderrazek Zaafrani AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
1465a2583f0SAbderrazek Zaafrani };
1475a2583f0SAbderrazek Zaafrani
1485a2583f0SAbderrazek Zaafrani // A costly instruction is replaced in this work by N efficient instructions
1495a2583f0SAbderrazek Zaafrani // The maximum of N is curently 10 and it is for ST4 case.
1505a2583f0SAbderrazek Zaafrani static const unsigned MaxNumRepl = 10;
1515a2583f0SAbderrazek Zaafrani
AArch64SIMDInstrOpt__anone10fe4070111::AArch64SIMDInstrOpt1525a2583f0SAbderrazek Zaafrani AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
1535a2583f0SAbderrazek Zaafrani initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
1545a2583f0SAbderrazek Zaafrani }
1555a2583f0SAbderrazek Zaafrani
1565a2583f0SAbderrazek Zaafrani /// Based only on latency of instructions, determine if it is cost efficient
1575a2583f0SAbderrazek Zaafrani /// to replace the instruction InstDesc by the instructions stored in the
1585a2583f0SAbderrazek Zaafrani /// array InstDescRepl.
1595a2583f0SAbderrazek Zaafrani /// Return true if replacement is expected to be faster.
1605a2583f0SAbderrazek Zaafrani bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
1615a2583f0SAbderrazek Zaafrani SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
1625a2583f0SAbderrazek Zaafrani
1635a2583f0SAbderrazek Zaafrani /// Determine if we need to exit the instruction replacement optimization
164a9134e86SEvandro Menezes /// passes early. This makes sure that no compile time is spent in this pass
165a9134e86SEvandro Menezes /// for targets with no need for any of these optimizations.
166a9134e86SEvandro Menezes /// Return true if early exit of the pass is recommended.
1675a2583f0SAbderrazek Zaafrani bool shouldExitEarly(MachineFunction *MF, Subpass SP);
1685a2583f0SAbderrazek Zaafrani
1695a2583f0SAbderrazek Zaafrani /// Check whether an equivalent DUP instruction has already been
1705a2583f0SAbderrazek Zaafrani /// created or not.
171a9134e86SEvandro Menezes /// Return true when the DUP instruction already exists. In this case,
1725a2583f0SAbderrazek Zaafrani /// DestReg will point to the destination of the already created DUP.
1735a2583f0SAbderrazek Zaafrani bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
1745a2583f0SAbderrazek Zaafrani unsigned LaneNumber, unsigned *DestReg) const;
1755a2583f0SAbderrazek Zaafrani
1765a2583f0SAbderrazek Zaafrani /// Certain SIMD instructions with vector element operand are not efficient.
1775a2583f0SAbderrazek Zaafrani /// Rewrite them into SIMD instructions with vector operands. This rewrite
1785a2583f0SAbderrazek Zaafrani /// is driven by the latency of the instructions.
1795a2583f0SAbderrazek Zaafrani /// Return true if the SIMD instruction is modified.
1805a2583f0SAbderrazek Zaafrani bool optimizeVectElement(MachineInstr &MI);
1815a2583f0SAbderrazek Zaafrani
1825a2583f0SAbderrazek Zaafrani /// Process The REG_SEQUENCE instruction, and extract the source
183a9134e86SEvandro Menezes /// operands of the ST2/4 instruction from it.
1845a2583f0SAbderrazek Zaafrani /// Example of such instructions.
1855a2583f0SAbderrazek Zaafrani /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
1865a2583f0SAbderrazek Zaafrani /// Return true when the instruction is processed successfully.
1875a2583f0SAbderrazek Zaafrani bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
1885a2583f0SAbderrazek Zaafrani unsigned* StRegKill, unsigned NumArg) const;
1895a2583f0SAbderrazek Zaafrani
1905a2583f0SAbderrazek Zaafrani /// Load/Store Interleaving instructions are not always beneficial.
191a9134e86SEvandro Menezes /// Replace them by ZIP instructionand classical load/store.
1925a2583f0SAbderrazek Zaafrani /// Return true if the SIMD instruction is modified.
1935a2583f0SAbderrazek Zaafrani bool optimizeLdStInterleave(MachineInstr &MI);
1945a2583f0SAbderrazek Zaafrani
1955a2583f0SAbderrazek Zaafrani /// Return the number of useful source registers for this
196a9134e86SEvandro Menezes /// instruction (2 for ST2 and 4 for ST4).
1975a2583f0SAbderrazek Zaafrani unsigned determineSrcReg(MachineInstr &MI) const;
1985a2583f0SAbderrazek Zaafrani
1995a2583f0SAbderrazek Zaafrani bool runOnMachineFunction(MachineFunction &Fn) override;
2005a2583f0SAbderrazek Zaafrani
getPassName__anone10fe4070111::AArch64SIMDInstrOpt2015a2583f0SAbderrazek Zaafrani StringRef getPassName() const override {
2025a2583f0SAbderrazek Zaafrani return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
2035a2583f0SAbderrazek Zaafrani }
2045a2583f0SAbderrazek Zaafrani };
2055a2583f0SAbderrazek Zaafrani
2065a2583f0SAbderrazek Zaafrani char AArch64SIMDInstrOpt::ID = 0;
2075a2583f0SAbderrazek Zaafrani
2085a2583f0SAbderrazek Zaafrani } // end anonymous namespace
2095a2583f0SAbderrazek Zaafrani
2105a2583f0SAbderrazek Zaafrani INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
2115a2583f0SAbderrazek Zaafrani AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
2125a2583f0SAbderrazek Zaafrani
2135a2583f0SAbderrazek Zaafrani /// Based only on latency of instructions, determine if it is cost efficient
2145a2583f0SAbderrazek Zaafrani /// to replace the instruction InstDesc by the instructions stored in the
2155a2583f0SAbderrazek Zaafrani /// array InstDescRepl.
2165a2583f0SAbderrazek Zaafrani /// Return true if replacement is expected to be faster.
2175a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction * MF,const MCInstrDesc * InstDesc,SmallVectorImpl<const MCInstrDesc * > & InstDescRepl)2185a2583f0SAbderrazek Zaafrani shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
2195a2583f0SAbderrazek Zaafrani SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
2205a2583f0SAbderrazek Zaafrani // Check if replacement decision is already available in the cached table.
2215a2583f0SAbderrazek Zaafrani // if so, return it.
222adcd0268SBenjamin Kramer std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
223a9134e86SEvandro Menezes auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
22437ef2255SJon Roelofs auto It = SIMDInstrTable.find(InstID);
22537ef2255SJon Roelofs if (It != SIMDInstrTable.end())
22637ef2255SJon Roelofs return It->second;
2275a2583f0SAbderrazek Zaafrani
2285a2583f0SAbderrazek Zaafrani unsigned SCIdx = InstDesc->getSchedClass();
2295a2583f0SAbderrazek Zaafrani const MCSchedClassDesc *SCDesc =
2305a2583f0SAbderrazek Zaafrani SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
2315a2583f0SAbderrazek Zaafrani
232a9134e86SEvandro Menezes // If a target does not define resources for the instructions
2335a2583f0SAbderrazek Zaafrani // of interest, then return false for no replacement.
2345a2583f0SAbderrazek Zaafrani const MCSchedClassDesc *SCDescRepl;
2355a2583f0SAbderrazek Zaafrani if (!SCDesc->isValid() || SCDesc->isVariant())
2365a2583f0SAbderrazek Zaafrani {
2375a2583f0SAbderrazek Zaafrani SIMDInstrTable[InstID] = false;
2385a2583f0SAbderrazek Zaafrani return false;
2395a2583f0SAbderrazek Zaafrani }
2405a2583f0SAbderrazek Zaafrani for (auto IDesc : InstDescRepl)
2415a2583f0SAbderrazek Zaafrani {
2425a2583f0SAbderrazek Zaafrani SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
2435a2583f0SAbderrazek Zaafrani IDesc->getSchedClass());
2445a2583f0SAbderrazek Zaafrani if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
2455a2583f0SAbderrazek Zaafrani {
2465a2583f0SAbderrazek Zaafrani SIMDInstrTable[InstID] = false;
2475a2583f0SAbderrazek Zaafrani return false;
2485a2583f0SAbderrazek Zaafrani }
2495a2583f0SAbderrazek Zaafrani }
2505a2583f0SAbderrazek Zaafrani
2515a2583f0SAbderrazek Zaafrani // Replacement cost.
2525a2583f0SAbderrazek Zaafrani unsigned ReplCost = 0;
2535a2583f0SAbderrazek Zaafrani for (auto IDesc :InstDescRepl)
2545a2583f0SAbderrazek Zaafrani ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
2555a2583f0SAbderrazek Zaafrani
2565a2583f0SAbderrazek Zaafrani if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
2575a2583f0SAbderrazek Zaafrani {
2585a2583f0SAbderrazek Zaafrani SIMDInstrTable[InstID] = true;
2595a2583f0SAbderrazek Zaafrani return true;
2605a2583f0SAbderrazek Zaafrani }
2615a2583f0SAbderrazek Zaafrani else
2625a2583f0SAbderrazek Zaafrani {
2635a2583f0SAbderrazek Zaafrani SIMDInstrTable[InstID] = false;
2645a2583f0SAbderrazek Zaafrani return false;
2655a2583f0SAbderrazek Zaafrani }
2665a2583f0SAbderrazek Zaafrani }
2675a2583f0SAbderrazek Zaafrani
268a9134e86SEvandro Menezes /// Determine if we need to exit this pass for a kind of instruction replacement
269a9134e86SEvandro Menezes /// early. This makes sure that no compile time is spent in this pass for
270a9134e86SEvandro Menezes /// targets with no need for any of these optimizations beyond performing this
271a9134e86SEvandro Menezes /// check.
272a9134e86SEvandro Menezes /// Return true if early exit of this pass for a kind of instruction
273a9134e86SEvandro Menezes /// replacement is recommended for a target.
shouldExitEarly(MachineFunction * MF,Subpass SP)2745a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
2755a2583f0SAbderrazek Zaafrani const MCInstrDesc* OriginalMCID;
2765a2583f0SAbderrazek Zaafrani SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
2775a2583f0SAbderrazek Zaafrani
2785a2583f0SAbderrazek Zaafrani switch (SP) {
279a9134e86SEvandro Menezes // For this optimization, check by comparing the latency of a representative
280a9134e86SEvandro Menezes // instruction to that of the replacement instructions.
281a9134e86SEvandro Menezes // TODO: check for all concerned instructions.
2825a2583f0SAbderrazek Zaafrani case VectorElem:
2835a2583f0SAbderrazek Zaafrani OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
2845a2583f0SAbderrazek Zaafrani ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
285a9134e86SEvandro Menezes ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
2865a2583f0SAbderrazek Zaafrani if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
2875a2583f0SAbderrazek Zaafrani return false;
2885a2583f0SAbderrazek Zaafrani break;
289a9134e86SEvandro Menezes
290a9134e86SEvandro Menezes // For this optimization, check for all concerned instructions.
2915a2583f0SAbderrazek Zaafrani case Interleave:
292adcd0268SBenjamin Kramer std::string Subtarget =
293adcd0268SBenjamin Kramer std::string(SchedModel.getSubtargetInfo()->getCPU());
29483dc53d3SJon Roelofs auto It = InterlEarlyExit.find(Subtarget);
29583dc53d3SJon Roelofs if (It != InterlEarlyExit.end())
29683dc53d3SJon Roelofs return It->second;
2975a2583f0SAbderrazek Zaafrani
2985a2583f0SAbderrazek Zaafrani for (auto &I : IRT) {
2995a2583f0SAbderrazek Zaafrani OriginalMCID = &TII->get(I.OrigOpc);
3005a2583f0SAbderrazek Zaafrani for (auto &Repl : I.ReplOpc)
3015a2583f0SAbderrazek Zaafrani ReplInstrMCID.push_back(&TII->get(Repl));
3025a2583f0SAbderrazek Zaafrani if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
3035a2583f0SAbderrazek Zaafrani InterlEarlyExit[Subtarget] = false;
3045a2583f0SAbderrazek Zaafrani return false;
3055a2583f0SAbderrazek Zaafrani }
3065a2583f0SAbderrazek Zaafrani ReplInstrMCID.clear();
3075a2583f0SAbderrazek Zaafrani }
3085a2583f0SAbderrazek Zaafrani InterlEarlyExit[Subtarget] = true;
3095a2583f0SAbderrazek Zaafrani break;
3105a2583f0SAbderrazek Zaafrani }
3115a2583f0SAbderrazek Zaafrani
3125a2583f0SAbderrazek Zaafrani return true;
3135a2583f0SAbderrazek Zaafrani }
3145a2583f0SAbderrazek Zaafrani
3155a2583f0SAbderrazek Zaafrani /// Check whether an equivalent DUP instruction has already been
3165a2583f0SAbderrazek Zaafrani /// created or not.
317a9134e86SEvandro Menezes /// Return true when the DUP instruction already exists. In this case,
3185a2583f0SAbderrazek Zaafrani /// DestReg will point to the destination of the already created DUP.
reuseDUP(MachineInstr & MI,unsigned DupOpcode,unsigned SrcReg,unsigned LaneNumber,unsigned * DestReg) const3195a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
3205a2583f0SAbderrazek Zaafrani unsigned SrcReg, unsigned LaneNumber,
3215a2583f0SAbderrazek Zaafrani unsigned *DestReg) const {
3225a2583f0SAbderrazek Zaafrani for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
3235a2583f0SAbderrazek Zaafrani MII != MIE;) {
3245a2583f0SAbderrazek Zaafrani MII--;
3255a2583f0SAbderrazek Zaafrani MachineInstr *CurrentMI = &*MII;
3265a2583f0SAbderrazek Zaafrani
3275a2583f0SAbderrazek Zaafrani if (CurrentMI->getOpcode() == DupOpcode &&
3285a2583f0SAbderrazek Zaafrani CurrentMI->getNumOperands() == 3 &&
3295a2583f0SAbderrazek Zaafrani CurrentMI->getOperand(1).getReg() == SrcReg &&
3305a2583f0SAbderrazek Zaafrani CurrentMI->getOperand(2).getImm() == LaneNumber) {
3315a2583f0SAbderrazek Zaafrani *DestReg = CurrentMI->getOperand(0).getReg();
3325a2583f0SAbderrazek Zaafrani return true;
3335a2583f0SAbderrazek Zaafrani }
3345a2583f0SAbderrazek Zaafrani }
3355a2583f0SAbderrazek Zaafrani
3365a2583f0SAbderrazek Zaafrani return false;
3375a2583f0SAbderrazek Zaafrani }
3385a2583f0SAbderrazek Zaafrani
3395a2583f0SAbderrazek Zaafrani /// Certain SIMD instructions with vector element operand are not efficient.
3405a2583f0SAbderrazek Zaafrani /// Rewrite them into SIMD instructions with vector operands. This rewrite
3415a2583f0SAbderrazek Zaafrani /// is driven by the latency of the instructions.
342a9134e86SEvandro Menezes /// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
343a9134e86SEvandro Menezes /// and FMULX and hence they are hardcoded.
3445a2583f0SAbderrazek Zaafrani ///
345a9134e86SEvandro Menezes /// For example:
3465a2583f0SAbderrazek Zaafrani /// fmla v0.4s, v1.4s, v2.s[1]
347a9134e86SEvandro Menezes ///
348a9134e86SEvandro Menezes /// Is rewritten into
349a9134e86SEvandro Menezes /// dup v3.4s, v2.s[1] // DUP not necessary if redundant
3505a2583f0SAbderrazek Zaafrani /// fmla v0.4s, v1.4s, v3.4s
351a9134e86SEvandro Menezes ///
3525a2583f0SAbderrazek Zaafrani /// Return true if the SIMD instruction is modified.
optimizeVectElement(MachineInstr & MI)3535a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
3545a2583f0SAbderrazek Zaafrani const MCInstrDesc *MulMCID, *DupMCID;
3555a2583f0SAbderrazek Zaafrani const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
3565a2583f0SAbderrazek Zaafrani
3575a2583f0SAbderrazek Zaafrani switch (MI.getOpcode()) {
3585a2583f0SAbderrazek Zaafrani default:
3595a2583f0SAbderrazek Zaafrani return false;
3605a2583f0SAbderrazek Zaafrani
3615a2583f0SAbderrazek Zaafrani // 4X32 instructions
3625a2583f0SAbderrazek Zaafrani case AArch64::FMLAv4i32_indexed:
3635a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv4i32lane);
3645a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMLAv4f32);
3655a2583f0SAbderrazek Zaafrani break;
3665a2583f0SAbderrazek Zaafrani case AArch64::FMLSv4i32_indexed:
3675a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv4i32lane);
3685a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMLSv4f32);
3695a2583f0SAbderrazek Zaafrani break;
3705a2583f0SAbderrazek Zaafrani case AArch64::FMULXv4i32_indexed:
3715a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv4i32lane);
3725a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMULXv4f32);
3735a2583f0SAbderrazek Zaafrani break;
3745a2583f0SAbderrazek Zaafrani case AArch64::FMULv4i32_indexed:
3755a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv4i32lane);
3765a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMULv4f32);
3775a2583f0SAbderrazek Zaafrani break;
3785a2583f0SAbderrazek Zaafrani
3795a2583f0SAbderrazek Zaafrani // 2X64 instructions
3805a2583f0SAbderrazek Zaafrani case AArch64::FMLAv2i64_indexed:
3815a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i64lane);
3825a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMLAv2f64);
3835a2583f0SAbderrazek Zaafrani break;
3845a2583f0SAbderrazek Zaafrani case AArch64::FMLSv2i64_indexed:
3855a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i64lane);
3865a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMLSv2f64);
3875a2583f0SAbderrazek Zaafrani break;
3885a2583f0SAbderrazek Zaafrani case AArch64::FMULXv2i64_indexed:
3895a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i64lane);
3905a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMULXv2f64);
3915a2583f0SAbderrazek Zaafrani break;
3925a2583f0SAbderrazek Zaafrani case AArch64::FMULv2i64_indexed:
3935a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i64lane);
3945a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMULv2f64);
3955a2583f0SAbderrazek Zaafrani break;
3965a2583f0SAbderrazek Zaafrani
3975a2583f0SAbderrazek Zaafrani // 2X32 instructions
3985a2583f0SAbderrazek Zaafrani case AArch64::FMLAv2i32_indexed:
3995a2583f0SAbderrazek Zaafrani RC = &AArch64::FPR64RegClass;
4005a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i32lane);
4015a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMLAv2f32);
4025a2583f0SAbderrazek Zaafrani break;
4035a2583f0SAbderrazek Zaafrani case AArch64::FMLSv2i32_indexed:
4045a2583f0SAbderrazek Zaafrani RC = &AArch64::FPR64RegClass;
4055a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i32lane);
4065a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMLSv2f32);
4075a2583f0SAbderrazek Zaafrani break;
4085a2583f0SAbderrazek Zaafrani case AArch64::FMULXv2i32_indexed:
4095a2583f0SAbderrazek Zaafrani RC = &AArch64::FPR64RegClass;
4105a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i32lane);
4115a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMULXv2f32);
4125a2583f0SAbderrazek Zaafrani break;
4135a2583f0SAbderrazek Zaafrani case AArch64::FMULv2i32_indexed:
4145a2583f0SAbderrazek Zaafrani RC = &AArch64::FPR64RegClass;
4155a2583f0SAbderrazek Zaafrani DupMCID = &TII->get(AArch64::DUPv2i32lane);
4165a2583f0SAbderrazek Zaafrani MulMCID = &TII->get(AArch64::FMULv2f32);
4175a2583f0SAbderrazek Zaafrani break;
4185a2583f0SAbderrazek Zaafrani }
4195a2583f0SAbderrazek Zaafrani
4205a2583f0SAbderrazek Zaafrani SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
4215a2583f0SAbderrazek Zaafrani ReplInstrMCID.push_back(DupMCID);
4225a2583f0SAbderrazek Zaafrani ReplInstrMCID.push_back(MulMCID);
4235a2583f0SAbderrazek Zaafrani if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
4245a2583f0SAbderrazek Zaafrani ReplInstrMCID))
4255a2583f0SAbderrazek Zaafrani return false;
4265a2583f0SAbderrazek Zaafrani
4275a2583f0SAbderrazek Zaafrani const DebugLoc &DL = MI.getDebugLoc();
4285a2583f0SAbderrazek Zaafrani MachineBasicBlock &MBB = *MI.getParent();
4295a2583f0SAbderrazek Zaafrani MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4305a2583f0SAbderrazek Zaafrani
431a9134e86SEvandro Menezes // Get the operands of the current SIMD arithmetic instruction.
4325ae66e56SDaniel Sanders Register MulDest = MI.getOperand(0).getReg();
4335ae66e56SDaniel Sanders Register SrcReg0 = MI.getOperand(1).getReg();
4345a2583f0SAbderrazek Zaafrani unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
4355ae66e56SDaniel Sanders Register SrcReg1 = MI.getOperand(2).getReg();
4365a2583f0SAbderrazek Zaafrani unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
4375a2583f0SAbderrazek Zaafrani unsigned DupDest;
4385a2583f0SAbderrazek Zaafrani
4395a2583f0SAbderrazek Zaafrani // Instructions of interest have either 4 or 5 operands.
4405a2583f0SAbderrazek Zaafrani if (MI.getNumOperands() == 5) {
4415ae66e56SDaniel Sanders Register SrcReg2 = MI.getOperand(3).getReg();
4425a2583f0SAbderrazek Zaafrani unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
4435a2583f0SAbderrazek Zaafrani unsigned LaneNumber = MI.getOperand(4).getImm();
4445a2583f0SAbderrazek Zaafrani // Create a new DUP instruction. Note that if an equivalent DUP instruction
445a9134e86SEvandro Menezes // has already been created before, then use that one instead of creating
4465a2583f0SAbderrazek Zaafrani // a new one.
4475a2583f0SAbderrazek Zaafrani if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
4485a2583f0SAbderrazek Zaafrani DupDest = MRI.createVirtualRegister(RC);
4495a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *DupMCID, DupDest)
4505a2583f0SAbderrazek Zaafrani .addReg(SrcReg2, Src2IsKill)
4515a2583f0SAbderrazek Zaafrani .addImm(LaneNumber);
4525a2583f0SAbderrazek Zaafrani }
4535a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *MulMCID, MulDest)
4545a2583f0SAbderrazek Zaafrani .addReg(SrcReg0, Src0IsKill)
4555a2583f0SAbderrazek Zaafrani .addReg(SrcReg1, Src1IsKill)
4565a2583f0SAbderrazek Zaafrani .addReg(DupDest, Src2IsKill);
4575a2583f0SAbderrazek Zaafrani } else if (MI.getNumOperands() == 4) {
4585a2583f0SAbderrazek Zaafrani unsigned LaneNumber = MI.getOperand(3).getImm();
4595a2583f0SAbderrazek Zaafrani if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
4605a2583f0SAbderrazek Zaafrani DupDest = MRI.createVirtualRegister(RC);
4615a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *DupMCID, DupDest)
4625a2583f0SAbderrazek Zaafrani .addReg(SrcReg1, Src1IsKill)
4635a2583f0SAbderrazek Zaafrani .addImm(LaneNumber);
4645a2583f0SAbderrazek Zaafrani }
4655a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *MulMCID, MulDest)
4665a2583f0SAbderrazek Zaafrani .addReg(SrcReg0, Src0IsKill)
4675a2583f0SAbderrazek Zaafrani .addReg(DupDest, Src1IsKill);
4685a2583f0SAbderrazek Zaafrani } else {
4695a2583f0SAbderrazek Zaafrani return false;
4705a2583f0SAbderrazek Zaafrani }
4715a2583f0SAbderrazek Zaafrani
4725a2583f0SAbderrazek Zaafrani ++NumModifiedInstr;
4735a2583f0SAbderrazek Zaafrani return true;
4745a2583f0SAbderrazek Zaafrani }
4755a2583f0SAbderrazek Zaafrani
4765a2583f0SAbderrazek Zaafrani /// Load/Store Interleaving instructions are not always beneficial.
477a9134e86SEvandro Menezes /// Replace them by ZIP instructions and classical load/store.
4785a2583f0SAbderrazek Zaafrani ///
479a9134e86SEvandro Menezes /// For example:
4805a2583f0SAbderrazek Zaafrani /// st2 {v0.4s, v1.4s}, addr
481a9134e86SEvandro Menezes ///
482a9134e86SEvandro Menezes /// Is rewritten into:
4835a2583f0SAbderrazek Zaafrani /// zip1 v2.4s, v0.4s, v1.4s
4845a2583f0SAbderrazek Zaafrani /// zip2 v3.4s, v0.4s, v1.4s
4855a2583f0SAbderrazek Zaafrani /// stp q2, q3, addr
4865a2583f0SAbderrazek Zaafrani //
487a9134e86SEvandro Menezes /// For example:
4885a2583f0SAbderrazek Zaafrani /// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
489a9134e86SEvandro Menezes ///
490a9134e86SEvandro Menezes /// Is rewritten into:
4915a2583f0SAbderrazek Zaafrani /// zip1 v4.4s, v0.4s, v2.4s
4925a2583f0SAbderrazek Zaafrani /// zip2 v5.4s, v0.4s, v2.4s
4935a2583f0SAbderrazek Zaafrani /// zip1 v6.4s, v1.4s, v3.4s
4945a2583f0SAbderrazek Zaafrani /// zip2 v7.4s, v1.4s, v3.4s
4955a2583f0SAbderrazek Zaafrani /// zip1 v8.4s, v4.4s, v6.4s
4965a2583f0SAbderrazek Zaafrani /// zip2 v9.4s, v4.4s, v6.4s
4975a2583f0SAbderrazek Zaafrani /// zip1 v10.4s, v5.4s, v7.4s
4985a2583f0SAbderrazek Zaafrani /// zip2 v11.4s, v5.4s, v7.4s
4995a2583f0SAbderrazek Zaafrani /// stp q8, q9, addr
5005a2583f0SAbderrazek Zaafrani /// stp q10, q11, addr+32
501a9134e86SEvandro Menezes ///
502a9134e86SEvandro Menezes /// Currently only instructions related to ST2 and ST4 are considered.
5035a2583f0SAbderrazek Zaafrani /// Other may be added later.
5045a2583f0SAbderrazek Zaafrani /// Return true if the SIMD instruction is modified.
optimizeLdStInterleave(MachineInstr & MI)5055a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
5065a2583f0SAbderrazek Zaafrani
5075a2583f0SAbderrazek Zaafrani unsigned SeqReg, AddrReg;
5085a2583f0SAbderrazek Zaafrani unsigned StReg[4], StRegKill[4];
5095a2583f0SAbderrazek Zaafrani MachineInstr *DefiningMI;
5105a2583f0SAbderrazek Zaafrani const DebugLoc &DL = MI.getDebugLoc();
5115a2583f0SAbderrazek Zaafrani MachineBasicBlock &MBB = *MI.getParent();
5125a2583f0SAbderrazek Zaafrani SmallVector<unsigned, MaxNumRepl> ZipDest;
5135a2583f0SAbderrazek Zaafrani SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
5145a2583f0SAbderrazek Zaafrani
5155a2583f0SAbderrazek Zaafrani // If current instruction matches any of the rewriting rules, then
5165a2583f0SAbderrazek Zaafrani // gather information about parameters of the new instructions.
5175a2583f0SAbderrazek Zaafrani bool Match = false;
5185a2583f0SAbderrazek Zaafrani for (auto &I : IRT) {
5195a2583f0SAbderrazek Zaafrani if (MI.getOpcode() == I.OrigOpc) {
5205a2583f0SAbderrazek Zaafrani SeqReg = MI.getOperand(0).getReg();
5215a2583f0SAbderrazek Zaafrani AddrReg = MI.getOperand(1).getReg();
5225a2583f0SAbderrazek Zaafrani DefiningMI = MRI->getUniqueVRegDef(SeqReg);
5235a2583f0SAbderrazek Zaafrani unsigned NumReg = determineSrcReg(MI);
5245a2583f0SAbderrazek Zaafrani if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
5255a2583f0SAbderrazek Zaafrani return false;
5265a2583f0SAbderrazek Zaafrani
5275a2583f0SAbderrazek Zaafrani for (auto &Repl : I.ReplOpc) {
5285a2583f0SAbderrazek Zaafrani ReplInstrMCID.push_back(&TII->get(Repl));
5295a2583f0SAbderrazek Zaafrani // Generate destination registers but only for non-store instruction.
5305a2583f0SAbderrazek Zaafrani if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
5315a2583f0SAbderrazek Zaafrani ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
5325a2583f0SAbderrazek Zaafrani }
5335a2583f0SAbderrazek Zaafrani Match = true;
5345a2583f0SAbderrazek Zaafrani break;
5355a2583f0SAbderrazek Zaafrani }
5365a2583f0SAbderrazek Zaafrani }
5375a2583f0SAbderrazek Zaafrani
5385a2583f0SAbderrazek Zaafrani if (!Match)
5395a2583f0SAbderrazek Zaafrani return false;
5405a2583f0SAbderrazek Zaafrani
5415a2583f0SAbderrazek Zaafrani // Determine if it is profitable to replace MI by the series of instructions
5425a2583f0SAbderrazek Zaafrani // represented in ReplInstrMCID.
5435a2583f0SAbderrazek Zaafrani if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
5445a2583f0SAbderrazek Zaafrani ReplInstrMCID))
5455a2583f0SAbderrazek Zaafrani return false;
5465a2583f0SAbderrazek Zaafrani
547a9134e86SEvandro Menezes // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
5485a2583f0SAbderrazek Zaafrani // this point, the code generation is hardcoded and does not rely on the IRT
5495a2583f0SAbderrazek Zaafrani // table used above given that code generation for ST2 replacement is somewhat
5505a2583f0SAbderrazek Zaafrani // different than for ST4 replacement. We could have added more info into the
5515a2583f0SAbderrazek Zaafrani // table related to how we build new instructions but we may be adding more
5525a2583f0SAbderrazek Zaafrani // complexity with that).
5535a2583f0SAbderrazek Zaafrani switch (MI.getOpcode()) {
5545a2583f0SAbderrazek Zaafrani default:
5555a2583f0SAbderrazek Zaafrani return false;
556a9134e86SEvandro Menezes
5575a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov16b:
5585a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov8b:
5595a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov8h:
5605a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov4h:
5615a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov4s:
5625a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov2s:
5635a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov2d:
564a9134e86SEvandro Menezes // ZIP instructions
5655a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
5665a2583f0SAbderrazek Zaafrani .addReg(StReg[0])
5675a2583f0SAbderrazek Zaafrani .addReg(StReg[1]);
5685a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
5695a2583f0SAbderrazek Zaafrani .addReg(StReg[0], StRegKill[0])
5705a2583f0SAbderrazek Zaafrani .addReg(StReg[1], StRegKill[1]);
571a9134e86SEvandro Menezes // STP instructions
5725a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
5735a2583f0SAbderrazek Zaafrani .addReg(ZipDest[0])
5745a2583f0SAbderrazek Zaafrani .addReg(ZipDest[1])
5755a2583f0SAbderrazek Zaafrani .addReg(AddrReg)
5765a2583f0SAbderrazek Zaafrani .addImm(0);
5775a2583f0SAbderrazek Zaafrani break;
578a9134e86SEvandro Menezes
5795a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv16b:
5805a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv8b:
5815a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv8h:
5825a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv4h:
5835a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv4s:
5845a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv2s:
5855a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv2d:
586a9134e86SEvandro Menezes // ZIP instructions
5875a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
5885a2583f0SAbderrazek Zaafrani .addReg(StReg[0])
5895a2583f0SAbderrazek Zaafrani .addReg(StReg[2]);
5905a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
5915a2583f0SAbderrazek Zaafrani .addReg(StReg[0], StRegKill[0])
5925a2583f0SAbderrazek Zaafrani .addReg(StReg[2], StRegKill[2]);
5935a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
5945a2583f0SAbderrazek Zaafrani .addReg(StReg[1])
5955a2583f0SAbderrazek Zaafrani .addReg(StReg[3]);
5965a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
5975a2583f0SAbderrazek Zaafrani .addReg(StReg[1], StRegKill[1])
5985a2583f0SAbderrazek Zaafrani .addReg(StReg[3], StRegKill[3]);
5995a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
6005a2583f0SAbderrazek Zaafrani .addReg(ZipDest[0])
6015a2583f0SAbderrazek Zaafrani .addReg(ZipDest[2]);
6025a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
6035a2583f0SAbderrazek Zaafrani .addReg(ZipDest[0])
6045a2583f0SAbderrazek Zaafrani .addReg(ZipDest[2]);
6055a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
6065a2583f0SAbderrazek Zaafrani .addReg(ZipDest[1])
6075a2583f0SAbderrazek Zaafrani .addReg(ZipDest[3]);
6085a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
6095a2583f0SAbderrazek Zaafrani .addReg(ZipDest[1])
6105a2583f0SAbderrazek Zaafrani .addReg(ZipDest[3]);
6115a2583f0SAbderrazek Zaafrani // stp instructions
6125a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
6135a2583f0SAbderrazek Zaafrani .addReg(ZipDest[4])
6145a2583f0SAbderrazek Zaafrani .addReg(ZipDest[5])
6155a2583f0SAbderrazek Zaafrani .addReg(AddrReg)
6165a2583f0SAbderrazek Zaafrani .addImm(0);
6175a2583f0SAbderrazek Zaafrani BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
6185a2583f0SAbderrazek Zaafrani .addReg(ZipDest[6])
6195a2583f0SAbderrazek Zaafrani .addReg(ZipDest[7])
6205a2583f0SAbderrazek Zaafrani .addReg(AddrReg)
6215a2583f0SAbderrazek Zaafrani .addImm(2);
6225a2583f0SAbderrazek Zaafrani break;
6235a2583f0SAbderrazek Zaafrani }
6245a2583f0SAbderrazek Zaafrani
6255a2583f0SAbderrazek Zaafrani ++NumModifiedInstr;
6265a2583f0SAbderrazek Zaafrani return true;
6275a2583f0SAbderrazek Zaafrani }
6285a2583f0SAbderrazek Zaafrani
6295a2583f0SAbderrazek Zaafrani /// Process The REG_SEQUENCE instruction, and extract the source
630a9134e86SEvandro Menezes /// operands of the ST2/4 instruction from it.
6315a2583f0SAbderrazek Zaafrani /// Example of such instruction.
6325a2583f0SAbderrazek Zaafrani /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
6335a2583f0SAbderrazek Zaafrani /// Return true when the instruction is processed successfully.
processSeqRegInst(MachineInstr * DefiningMI,unsigned * StReg,unsigned * StRegKill,unsigned NumArg) const6345a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
6355a2583f0SAbderrazek Zaafrani unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
636*5a667c0eSKazu Hirata assert(DefiningMI != nullptr);
6375a2583f0SAbderrazek Zaafrani if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
6385a2583f0SAbderrazek Zaafrani return false;
6395a2583f0SAbderrazek Zaafrani
6405a2583f0SAbderrazek Zaafrani for (unsigned i=0; i<NumArg; i++) {
6415a2583f0SAbderrazek Zaafrani StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
6425a2583f0SAbderrazek Zaafrani StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
6435a2583f0SAbderrazek Zaafrani
6445b8bbbecSZarko Todorovski // Validation check for the other arguments.
6455a2583f0SAbderrazek Zaafrani if (DefiningMI->getOperand(2*i+2).isImm()) {
6465a2583f0SAbderrazek Zaafrani switch (DefiningMI->getOperand(2*i+2).getImm()) {
6475a2583f0SAbderrazek Zaafrani default:
6485a2583f0SAbderrazek Zaafrani return false;
649a9134e86SEvandro Menezes
6505a2583f0SAbderrazek Zaafrani case AArch64::dsub0:
6515a2583f0SAbderrazek Zaafrani case AArch64::dsub1:
6525a2583f0SAbderrazek Zaafrani case AArch64::dsub2:
6535a2583f0SAbderrazek Zaafrani case AArch64::dsub3:
6545a2583f0SAbderrazek Zaafrani case AArch64::qsub0:
6555a2583f0SAbderrazek Zaafrani case AArch64::qsub1:
6565a2583f0SAbderrazek Zaafrani case AArch64::qsub2:
6575a2583f0SAbderrazek Zaafrani case AArch64::qsub3:
6585a2583f0SAbderrazek Zaafrani break;
6595a2583f0SAbderrazek Zaafrani }
6605a2583f0SAbderrazek Zaafrani }
6615a2583f0SAbderrazek Zaafrani else
6625a2583f0SAbderrazek Zaafrani return false;
6635a2583f0SAbderrazek Zaafrani }
6645a2583f0SAbderrazek Zaafrani return true;
6655a2583f0SAbderrazek Zaafrani }
6665a2583f0SAbderrazek Zaafrani
6675a2583f0SAbderrazek Zaafrani /// Return the number of useful source registers for this instruction
6685a2583f0SAbderrazek Zaafrani /// (2 for ST2 and 4 for ST4).
determineSrcReg(MachineInstr & MI) const6695a2583f0SAbderrazek Zaafrani unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
6705a2583f0SAbderrazek Zaafrani switch (MI.getOpcode()) {
6715a2583f0SAbderrazek Zaafrani default:
6725a2583f0SAbderrazek Zaafrani llvm_unreachable("Unsupported instruction for this pass");
673a9134e86SEvandro Menezes
6745a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov16b:
6755a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov8b:
6765a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov8h:
6775a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov4h:
6785a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov4s:
6795a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov2s:
6805a2583f0SAbderrazek Zaafrani case AArch64::ST2Twov2d:
6815a2583f0SAbderrazek Zaafrani return 2;
682a9134e86SEvandro Menezes
6835a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv16b:
6845a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv8b:
6855a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv8h:
6865a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv4h:
6875a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv4s:
6885a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv2s:
6895a2583f0SAbderrazek Zaafrani case AArch64::ST4Fourv2d:
6905a2583f0SAbderrazek Zaafrani return 4;
6915a2583f0SAbderrazek Zaafrani }
6925a2583f0SAbderrazek Zaafrani }
6935a2583f0SAbderrazek Zaafrani
runOnMachineFunction(MachineFunction & MF)6945a2583f0SAbderrazek Zaafrani bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
695f1caa283SMatthias Braun if (skipFunction(MF.getFunction()))
6965a2583f0SAbderrazek Zaafrani return false;
6975a2583f0SAbderrazek Zaafrani
6985a2583f0SAbderrazek Zaafrani TII = MF.getSubtarget().getInstrInfo();
6995a2583f0SAbderrazek Zaafrani MRI = &MF.getRegInfo();
7005a2583f0SAbderrazek Zaafrani const TargetSubtargetInfo &ST = MF.getSubtarget();
7015a2583f0SAbderrazek Zaafrani const AArch64InstrInfo *AAII =
7025a2583f0SAbderrazek Zaafrani static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
7035a2583f0SAbderrazek Zaafrani if (!AAII)
7045a2583f0SAbderrazek Zaafrani return false;
7050d7df36cSSanjay Patel SchedModel.init(&ST);
7065a2583f0SAbderrazek Zaafrani if (!SchedModel.hasInstrSchedModel())
7075a2583f0SAbderrazek Zaafrani return false;
7085a2583f0SAbderrazek Zaafrani
7095a2583f0SAbderrazek Zaafrani bool Changed = false;
7105a2583f0SAbderrazek Zaafrani for (auto OptimizationKind : {VectorElem, Interleave}) {
7115a2583f0SAbderrazek Zaafrani if (!shouldExitEarly(&MF, OptimizationKind)) {
7125a2583f0SAbderrazek Zaafrani SmallVector<MachineInstr *, 8> RemoveMIs;
7135a2583f0SAbderrazek Zaafrani for (MachineBasicBlock &MBB : MF) {
714ee0133dcSKazu Hirata for (MachineInstr &MI : MBB) {
7155a2583f0SAbderrazek Zaafrani bool InstRewrite;
7165a2583f0SAbderrazek Zaafrani if (OptimizationKind == VectorElem)
7175a2583f0SAbderrazek Zaafrani InstRewrite = optimizeVectElement(MI) ;
7185a2583f0SAbderrazek Zaafrani else
7195a2583f0SAbderrazek Zaafrani InstRewrite = optimizeLdStInterleave(MI);
7205a2583f0SAbderrazek Zaafrani if (InstRewrite) {
7215a2583f0SAbderrazek Zaafrani // Add MI to the list of instructions to be removed given that it
7225a2583f0SAbderrazek Zaafrani // has been replaced.
7235a2583f0SAbderrazek Zaafrani RemoveMIs.push_back(&MI);
7245a2583f0SAbderrazek Zaafrani Changed = true;
7255a2583f0SAbderrazek Zaafrani }
7265a2583f0SAbderrazek Zaafrani }
7275a2583f0SAbderrazek Zaafrani }
7285a2583f0SAbderrazek Zaafrani for (MachineInstr *MI : RemoveMIs)
7295a2583f0SAbderrazek Zaafrani MI->eraseFromParent();
7305a2583f0SAbderrazek Zaafrani }
7315a2583f0SAbderrazek Zaafrani }
7325a2583f0SAbderrazek Zaafrani
7335a2583f0SAbderrazek Zaafrani return Changed;
7345a2583f0SAbderrazek Zaafrani }
7355a2583f0SAbderrazek Zaafrani
736a9134e86SEvandro Menezes /// Returns an instance of the high cost ASIMD instruction replacement
737a9134e86SEvandro Menezes /// optimization pass.
createAArch64SIMDInstrOptPass()7385a2583f0SAbderrazek Zaafrani FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
7395a2583f0SAbderrazek Zaafrani return new AArch64SIMDInstrOpt();
7405a2583f0SAbderrazek Zaafrani }
741